diff --git a/hyak_scripts/filter.py b/hyak_scripts/filter.py
new file mode 100644
index 0000000..1c0516e
--- /dev/null
+++ b/hyak_scripts/filter.py
@@ -0,0 +1,35 @@
+"""Filter the metadata parquet files under ``metadata_path`` to a set of uids.
+
+Builds uid strings from the saved index array, reads only the metadata rows
+whose ``uid`` is in that set, and writes them out as a new parquet dataset.
+"""
+
+import dask
+import dask.dataframe as dd
+import numpy as np
+import pyarrow.parquet as pq
+import pandas as pd
+
+metadata_path = '/mmfs1/gscratch/krishna/mayank/dfn/dfn-medium/metadata'
+
+indices = np.load('/mmfs1/gscratch/krishna/mayank/dfn/indicies/datacomp_medium_dfn_20m_inds.npy')
+# Each row supplies two integers, each rendered as zero-padded 16-digit hex and
+# concatenated. A set comprehension avoids materialising a throwaway list first.
+all_uids = {f'{uid[0]:016x}{uid[1]:016x}' for uid in indices}
+
+# Only rows whose uid is in all_uids are kept by the read filter.
+fp = pq.read_table(
+    source=metadata_path,
+    use_threads=True,
+    filters=[('uid', 'in', all_uids)]
+)
+# Take the output schema from one existing shard; the path is derived from
+# metadata_path so the two locations cannot drift apart.
+schema = pq.read_schema(f'{metadata_path}/0a4a1e10352ec4366858927c33873cdc.parquet')
+df = fp.to_pandas()
+
+# save df to disk
+dd.from_pandas(df, chunksize=500000).to_parquet(
+    '/mmfs1/gscratch/krishna/mayank/dfn/dfn-medium/metadata_filtered',
+    schema=schema
+)
diff --git a/hyak_scripts/upload.py b/hyak_scripts/upload.py
new file mode 100644
index 0000000..b602bae
--- /dev/null
+++ b/hyak_scripts/upload.py
@@ -0,0 +1,51 @@
+"""Download the images listed in parquet metadata and write webdataset shards."""
+
+import img2dataset
+import argparse
+from cloudpathlib import CloudPath
+
+
+def upload(source, destination):
+    """Run img2dataset over ``source`` and write the shards to ``destination``.
+
+    Args:
+        source: Location of the parquet input; converted with ``str()`` and
+            passed to img2dataset as ``url_list``.
+        destination: Output folder, handed to img2dataset unchanged.
+    """
+    img2dataset.download(
+        processes_count=16,
+        thread_count=128,
+        image_size=512,
+        resize_mode="keep_ratio_largest",
+        url_list=str(source),
+        resize_only_if_bigger=True,
+        encode_format="jpg",
+        output_format="webdataset",
+        retries=3,
+        enable_wandb=False,
+        wandb_project="dataupload",
+        skip_reencode=True,
+        output_folder=destination,
+        input_format="parquet",
+        url_col="url",
+        caption_col="text",
+        number_sample_per_shard=10000,
+        distributor="multiprocessing",
+        save_additional_columns=["uid"],
+        oom_shard_count=8,
+        bbox_col="face_bboxes",
+    )
+    print("download complete!!")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--source", type=str, required=True)
+    parser.add_argument("--destination", type=str, required=True)
+    args = parser.parse_args()
+    # The destination is forwarded as the raw string from the command line.
+    upload(args.source, args.destination)
+
+# Example invocation:
+# python3 upload.py --source /mmfs1/gscratch/krishna/mayank/dfn/dfn-medium/metadata_filtered --destination gs://dfn_medium