Skip to content

Commit dde0e45

Browse files
committed
feat(ingest): also stream in metadata file for batch submissions
1 parent b3cab5c commit dde0e45

File tree

2 files changed

+89
-58
lines changed

2 files changed

+89
-58
lines changed

ingest/Snakefile

+20 -4
Original file line number | Diff line number | Diff line change
@@ -588,14 +588,30 @@ rule sort_fasta:
588588
awk '/^>/ {{if (seq) print seq; seq=""; print; next}} {{seq = seq $0}} END {{if (seq) print seq}}' {input.sequences} | \
589589
paste - - | \
590590
sort -k1,1 | \
591-
tr "\t" "\n" > {output.sorted}
591+
tr "\\t" "\\n" > {output.sorted}
592+
"""
593+
594+
595+
rule sort_metadata:
596+
input:
597+
metadata="results/{basename}.tsv",
598+
output:
599+
sorted="results/{basename}_sorted.tsv",
600+
shell:
601+
"""
602+
columnNumber=$(awk -F'\\t' '{{for(i=1;i<=NF;i++) if($i=="submissionId") print i}}' {input.metadata});
603+
if [ ${{columnNumber}} ]; then
604+
(head -n 1 {input.metadata} && tail -n +2 {input.metadata} | sort -t$'\t' -k${{columnNumber}},${{columnNumber}}) > {output.sorted}
605+
else
606+
cat {input.metadata} > {output.sorted}
607+
fi
592608
"""
593609

594610

595611
rule submit:
596612
input:
597613
script="scripts/call_loculus.py",
598-
metadata="results/submit_metadata.tsv",
614+
metadata="results/submit_metadata_sorted.tsv",
599615
sequences="results/submit_sequences_sorted.fasta",
600616
config="results/config.yaml",
601617
output:
@@ -618,7 +634,7 @@ rule submit:
618634
rule revise:
619635
input:
620636
script="scripts/call_loculus.py",
621-
metadata="results/revise_metadata.tsv",
637+
metadata="results/revise_metadata_sorted.tsv",
622638
sequences="results/revise_sequences_sorted.fasta",
623639
config="results/config.yaml",
624640
output:
@@ -641,7 +657,7 @@ rule revise:
641657
rule regroup_and_revoke:
642658
input:
643659
script="scripts/call_loculus.py",
644-
metadata="results/metadata_to_submit_prior_to_revoke.tsv",
660+
metadata="results/metadata_to_submit_prior_to_revoke_sorted.tsv",
645661
sequences="results/sequences_to_submit_prior_to_revoke_sorted.fasta",
646662
map="results/to_revoke.json",
647663
config="results/config.yaml",

ingest/scripts/call_loculus.py

+69 -54
Original file line number | Diff line number | Diff line change
@@ -179,16 +179,14 @@ def post_fasta_batches(
179179
metadata_file: str,
180180
config: Config,
181181
params: dict[str, str],
182-
chunk_size=60000,
182+
chunk_size=1000,
183183
) -> requests.Response:
184184
"""Chunks metadata files, joins with sequences and submits each chunk via POST."""
185-
df = pd.read_csv(metadata_file, sep="\t")
186-
logger.info(df.columns)
187-
submission_ids = df["submissionId"].tolist()
188185
sequences_output_file = "results/batch_sequences.fasta"
189186
metadata_output_file = "results/batch_metadata.tsv"
190187

191-
def submit(metadata_output_file, sequences_output_file, batch_num):
188+
def submit(metadata_output_file, sequences_output_file, number_of_submissions):
189+
batch_num = -(number_of_submissions // - chunk_size) # ceiling division
192190
with (
193191
open(metadata_output_file, "rb") as metadata_,
194192
open(sequences_output_file, "rb") as fasta_,
@@ -199,68 +197,85 @@ def submit(metadata_output_file, sequences_output_file, batch_num):
199197
}
200198
response = make_request(HTTPMethod.POST, url, config, params=params, files=files)
201199
logger.info(f"Batch {batch_num} Response: {response.status_code}")
200+
if response.status_code != 200:
201+
logger.error(f"Error in batch {batch_num}: {response.text}")
202202
return response
203203

204-
def write_csv(submission_id_chunk):
205-
metadata = df[df["submissionId"].isin(submission_id_chunk)]
206-
metadata.to_csv(metadata_output_file, sep="\t", index=False, float_format="%.0f")
204+
return response
205+
206+
def delete_batch_files(fasta_output, metadata_output):
207+
fasta_output.seek(0)
208+
fasta_output.truncate()
209+
metadata_output.seek(0)
210+
metadata_output.truncate()
207211

208-
current_submission_id = None
209-
number_of_submissions = 0
212+
number_of_submissions = -1
210213
submission_id_chunk = []
214+
fasta_submission_id = None
215+
fasta_header = None
216+
211217
with (
212218
open(fasta_file, encoding="utf-8") as fasta_file_stream,
213-
open(sequences_output_file, "a", encoding="utf-8") as output,
219+
open(sequences_output_file, "a", encoding="utf-8") as fasta_output,
220+
open(metadata_file, encoding="utf-8") as metadata_file_stream,
221+
open(metadata_output_file, "a", encoding="utf-8") as metadata_output,
214222
):
215-
for line in fasta_file_stream:
216-
if line.startswith(">"):
217-
header = line.strip()
218-
if config.segmented:
219-
submission_id = "_".join(header[1:].split("_")[:-1])
220-
else:
221-
submission_id = header[1:]
222-
if submission_id == current_submission_id:
223-
continue
224-
if current_submission_id and submission_id < current_submission_id:
225-
msg = "Fasta file is not sorted by submissionId"
226-
logger.error(msg)
227-
raise ValueError(msg)
228-
229-
number_of_submissions += 1
230-
current_submission_id = submission_id
231-
submission_id_chunk.append(submission_id)
232-
if submission_id not in submission_ids:
233-
msg = f"SubmissionId {submission_id} not found in metadata"
234-
logger.error(msg)
235-
raise ValueError(msg)
223+
for record in metadata_file_stream:
224+
number_of_submissions += 1
225+
metadata_output.write(record)
226+
if number_of_submissions == 0:
227+
# get column index of submissionId
228+
print(record.split("\t"))
229+
header_index = record.split("\t").index("submissionId")
236230
continue
237231

232+
metadata_submission_id = record.split("\t")[header_index]
233+
234+
if fasta_submission_id and metadata_submission_id != fasta_submission_id:
235+
msg = f"Fasta SubmissionId {fasta_submission_id} not in correct order in metadata"
236+
logger.error(msg)
237+
raise ValueError(msg)
238+
239+
searching = True
240+
241+
while searching:
242+
line = fasta_file_stream.readline()
243+
if not line:
244+
searching = False
245+
break
246+
if line.startswith(">"):
247+
header = line.strip()
248+
fasta_header = header
249+
if config.segmented:
250+
submission_id = "_".join(header[1:].split("_")[:-1])
251+
else:
252+
submission_id = header[1:]
253+
if submission_id == metadata_submission_id:
254+
continue
255+
if submission_id < metadata_submission_id:
256+
msg = "Fasta file is not sorted by submissionId"
257+
logger.error(msg)
258+
raise ValueError(msg)
259+
260+
fasta_submission_id = submission_id
261+
submission_id_chunk.append(submission_id)
262+
searching = False
263+
break
264+
265+
# add to sequences file
266+
fasta_output.write(fasta_header + "\n")
267+
fasta_output.write(line)
268+
238269
if number_of_submissions % chunk_size == 0:
239-
# submit sequences and metadata
240-
batch_num = number_of_submissions // chunk_size
241-
write_csv(submission_id_chunk)
242-
response = submit(metadata_output_file, sequences_output_file, batch_num)
243-
if response.status_code != 200:
244-
logger.error(f"Error in batch {batch_num + 1}: {response.text}")
245-
return response
246-
247-
# delete the contents of sequences_output_file
248-
output.seek(0)
249-
output.truncate()
270+
response = submit(
271+
metadata_output_file, sequences_output_file, number_of_submissions
272+
)
273+
delete_batch_files(fasta_output, metadata_output)
250274
submission_id_chunk = []
251275

252-
# add to sequences file
253-
output.write(header + "\n")
254-
output.write(line + "\n")
255-
256276
if submission_id_chunk:
257277
# submit the last chunk
258-
write_csv(submission_id_chunk)
259-
batch_num = int(number_of_submissions // chunk_size) + 1
260-
response = submit(metadata_output_file, sequences_output_file, batch_num)
261-
if response.status_code != 200:
262-
logger.error(f"Error in batch {batch_num + 1}: {response.text}")
263-
return response
278+
response = submit(metadata_output_file, sequences_output_file, number_of_submissions)
264279

265280
return response
266281

@@ -269,7 +284,7 @@ def submit_or_revise(
269284
metadata, sequences, config: Config, group_id, mode=Literal["submit", "revise"]
270285
):
271286
"""
272-
Submit/revise data to Loculus.
287+
Submit/revise data to Loculus -requires metadata and sequences sorted by submissionId.
273288
"""
274289
logging_strings: dict[str, str]
275290
endpoint: str

0 commit comments

Comments
 (0)