@@ -179,16 +179,14 @@ def post_fasta_batches(
     metadata_file: str,
     config: Config,
     params: dict[str, str],
-    chunk_size=60000,
+    chunk_size=1000,
 ) -> requests.Response:
     """Chunks metadata files, joins with sequences and submits each chunk via POST."""
-    df = pd.read_csv(metadata_file, sep="\t")
-    logger.info(df.columns)
-    submission_ids = df["submissionId"].tolist()
     sequences_output_file = "results/batch_sequences.fasta"
     metadata_output_file = "results/batch_metadata.tsv"
 
-    def submit(metadata_output_file, sequences_output_file, batch_num):
+    def submit(metadata_output_file, sequences_output_file, number_of_submissions):
+        batch_num = -(number_of_submissions // -chunk_size)  # ceiling division
         with (
             open(metadata_output_file, "rb") as metadata_,
             open(sequences_output_file, "rb") as fasta_,
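Note: the default chunk size drops from 60000 to 1000, and submit now derives the batch label itself from the running submission count. The negated floor division is ceiling division on non-negative integers, so batch 1 covers the first chunk_size submissions, batch 2 the next, and so on. A minimal standalone sketch of the idiom, with hypothetical values that are not part of the change:

import math

chunk_size = 1000
for count in (1, 999, 1000, 1001, 2500):
    batch_num = -(count // -chunk_size)  # ceiling division using only integer arithmetic
    assert batch_num == math.ceil(count / chunk_size)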
@@ -199,68 +197,85 @@ def submit(metadata_output_file, sequences_output_file, batch_num):
             }
             response = make_request(HTTPMethod.POST, url, config, params=params, files=files)
             logger.info(f"Batch {batch_num} Response: {response.status_code}")
+            if response.status_code != 200:
+                logger.error(f"Error in batch {batch_num}: {response.text}")
             return response
 
-    def write_csv(submission_id_chunk):
-        metadata = df[df["submissionId"].isin(submission_id_chunk)]
-        metadata.to_csv(metadata_output_file, sep="\t", index=False, float_format="%.0f")
+        return response
+
+    def delete_batch_files(fasta_output, metadata_output):
+        fasta_output.seek(0)
+        fasta_output.truncate()
+        metadata_output.seek(0)
+        metadata_output.truncate()
 
-    current_submission_id = None
-    number_of_submissions = 0
+    number_of_submissions = -1
     submission_id_chunk = []
+    fasta_submission_id = None
+    fasta_header = None
+
     with (
         open(fasta_file, encoding="utf-8") as fasta_file_stream,
-        open(sequences_output_file, "a", encoding="utf-8") as output,
+        open(sequences_output_file, "a", encoding="utf-8") as fasta_output,
+        open(metadata_file, encoding="utf-8") as metadata_file_stream,
+        open(metadata_output_file, "a", encoding="utf-8") as metadata_output,
     ):
-        for line in fasta_file_stream:
-            if line.startswith(">"):
-                header = line.strip()
-                if config.segmented:
-                    submission_id = "_".join(header[1:].split("_")[:-1])
-                else:
-                    submission_id = header[1:]
-                if submission_id == current_submission_id:
-                    continue
-                if current_submission_id and submission_id < current_submission_id:
-                    msg = "Fasta file is not sorted by submissionId"
-                    logger.error(msg)
-                    raise ValueError(msg)
-
-                number_of_submissions += 1
-                current_submission_id = submission_id
-                submission_id_chunk.append(submission_id)
-                if submission_id not in submission_ids:
-                    msg = f"SubmissionId {submission_id} not found in metadata"
-                    logger.error(msg)
-                    raise ValueError(msg)
+        for record in metadata_file_stream:
+            number_of_submissions += 1
+            metadata_output.write(record)
+            if number_of_submissions == 0:
+                # get column index of submissionId
+                print(record.split("\t"))
+                header_index = record.split("\t").index("submissionId")
                 continue
 
+            metadata_submission_id = record.split("\t")[header_index]
+
+            if fasta_submission_id and metadata_submission_id != fasta_submission_id:
+                msg = f"Fasta SubmissionId {fasta_submission_id} not in correct order in metadata"
+                logger.error(msg)
+                raise ValueError(msg)
+
+            searching = True
+
+            while searching:
+                line = fasta_file_stream.readline()
+                if not line:
+                    searching = False
+                    break
+                if line.startswith(">"):
+                    header = line.strip()
+                    fasta_header = header
+                    if config.segmented:
+                        submission_id = "_".join(header[1:].split("_")[:-1])
+                    else:
+                        submission_id = header[1:]
+                    if submission_id == metadata_submission_id:
+                        continue
+                    if submission_id < metadata_submission_id:
+                        msg = "Fasta file is not sorted by submissionId"
+                        logger.error(msg)
+                        raise ValueError(msg)
+
+                    fasta_submission_id = submission_id
+                    submission_id_chunk.append(submission_id)
+                    searching = False
+                    break
+
+                # add to sequences file
+                fasta_output.write(fasta_header + "\n")
+                fasta_output.write(line)
+
             if number_of_submissions % chunk_size == 0:
-                # submit sequences and metadata
-                batch_num = number_of_submissions // chunk_size
-                write_csv(submission_id_chunk)
-                response = submit(metadata_output_file, sequences_output_file, batch_num)
-                if response.status_code != 200:
-                    logger.error(f"Error in batch {batch_num + 1}: {response.text}")
-                    return response
-
-                # delete the contents of sequences_output_file
-                output.seek(0)
-                output.truncate()
+                response = submit(
+                    metadata_output_file, sequences_output_file, number_of_submissions
+                )
+                delete_batch_files(fasta_output, metadata_output)
                 submission_id_chunk = []
 
-            # add to sequences file
-            output.write(header + "\n")
-            output.write(line + "\n")
-
         if submission_id_chunk:
             # submit the last chunk
-            write_csv(submission_id_chunk)
-            batch_num = int(number_of_submissions // chunk_size) + 1
-            response = submit(metadata_output_file, sequences_output_file, batch_num)
-            if response.status_code != 200:
-                logger.error(f"Error in batch {batch_num + 1}: {response.text}")
-                return response
+            response = submit(metadata_output_file, sequences_output_file, number_of_submissions)
 
     return response
 
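Note: instead of loading the whole metadata table with pandas and rewriting a per-chunk TSV, the function now streams the metadata and FASTA files in lockstep, appends each record to the two batch files, and clears those files in place between POSTs via delete_batch_files: seek to the start, then truncate, on the still-open append handles. A minimal sketch of that clear-and-reuse pattern, with a hypothetical file name and a print standing in for the POST:

def clear(handle):
    handle.seek(0)     # rewind the file position to the start
    handle.truncate()  # cut the file down to that position, i.e. empty it

with open("batch.tsv", "a", encoding="utf-8") as batch:
    for i in range(25):
        batch.write(f"record-{i}\n")
        if (i + 1) % 10 == 0:
            batch.flush()
            print("would POST the current batch here")
            clear(batch)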
@@ -269,7 +284,7 @@ def submit_or_revise(
     metadata, sequences, config: Config, group_id, mode=Literal["submit", "revise"]
 ):
     """
-    Submit/revise data to Loculus.
+    Submit/revise data to Loculus - requires metadata and sequences sorted by submissionId.
     """
     logging_strings: dict[str, str]
     endpoint: str
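Note: the revised docstring makes the new contract explicit: both inputs must already be ordered by submissionId, because the streaming join only ever moves forward through each file. A sketch of how the metadata side could be prepared, assuming pandas is available and using hypothetical file names; the FASTA file would need the same ordering:

import pandas as pd

# Hypothetical pre-sorting step, not part of this change.
df = pd.read_csv("metadata.tsv", sep="\t", dtype=str)
df.sort_values("submissionId").to_csv("metadata_sorted.tsv", sep="\t", index=False)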