Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions hyak_scripts/filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import dask
import dask.dataframe as dd
import numpy as np
import pyarrow.parquet as pq
import pandas as pd

# Filter the metadata parquet dataset down to the rows whose `uid` appears in
# the saved index array, then write the filtered rows out as a new parquet
# dataset.

metadata_path = '/mmfs1/gscratch/krishna/mayank/dfn/dfn-medium/metadata'

# Each entry of the index array is rendered as two zero-padded 16-digit hex
# fields concatenated into a single 32-character uid string.
# NOTE(review): assumes every entry holds (at least) two integers — confirm
# against the code that produced the .npy file.
indices = np.load('/mmfs1/gscratch/krishna/mayank/dfn/indicies/datacomp_medium_dfn_20m_inds.npy')
# Set comprehension: build the lookup set directly rather than materialising
# an intermediate list of every uid string first.
all_uids = {f'{uid[0]:016x}{uid[1]:016x}' for uid in indices}

# The membership filter is handed to pyarrow so it is applied during the read.
fp = pq.read_table(
    source=metadata_path,
    use_threads=True,
    filters=[('uid', 'in', all_uids)],
)
# Schema is taken from one shard of the source dataset and reused for the
# output below.
schema = pq.read_schema("/mmfs1/gscratch/krishna/mayank/dfn/dfn-medium/metadata/0a4a1e10352ec4366858927c33873cdc.parquet")
df = fp.to_pandas()

# save df to disk
dd.from_pandas(df, chunksize=500000).to_parquet(
    '/mmfs1/gscratch/krishna/mayank/dfn/dfn-medium/metadata_filtered',
    schema=schema,
)

40 changes: 40 additions & 0 deletions hyak_scripts/upload.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import img2dataset
import argparse
from cloudpathlib import CloudPath

def upload(source, destination):
    """Run ``img2dataset.download`` on the parquet input at *source*.

    Results are written in webdataset format under *destination*. Every
    other setting is fixed in the option groups below. Prints a completion
    message once the call returns.
    """
    input_options = {
        "url_list": str(source),
        "input_format": "parquet",
        "url_col": "url",
        "caption_col": "text",
        "bbox_col": "face_bboxes",
        "save_additional_columns": ["uid"],
    }
    image_options = {
        "image_size": 512,
        "resize_mode": "keep_ratio_largest",
        "resize_only_if_bigger": True,
        "encode_format": "jpg",
        "skip_reencode": True,
    }
    output_options = {
        "output_folder": destination,
        "output_format": "webdataset",
        "number_sample_per_shard": 10000,
        "oom_shard_count": 8,
    }
    run_options = {
        "processes_count": 16,
        "thread_count": 128,
        "retries": 3,
        "distributor": "multiprocessing",
        # The project name is still passed although wandb is switched off.
        "enable_wandb": False,
        "wandb_project": "dataupload",
    }

    img2dataset.download(
        **input_options,
        **image_options,
        **output_options,
        **run_options,
    )
    print("download complete!!")


if __name__ == "__main__":
    # Command-line entry point: both paths are required string options.
    cli = argparse.ArgumentParser()
    for flag in ("--source", "--destination"):
        cli.add_argument(flag, type=str, required=True)
    opts = cli.parse_args()
    # The destination is forwarded as a plain string; wrapping it in
    # CloudPath is intentionally left disabled.
    upload(opts.source, opts.destination)

# python3 upload.py --source /mmfs1/gscratch/krishna/mayank/dfn/dfn-medium/metadata_filtered --destination gs://dfn_medium