# hfdataset.py
import math
import time
from pathlib import Path

import pandas as pd
import requests
from tqdm.auto import tqdm  # auto picks the right progress bar for scripts and notebooks

from datasets import load_dataset
# list_datasets has moved from `datasets` to `huggingface_hub`
from huggingface_hub import list_datasets
def fetch_issues(
    owner="huggingface",
    repo="datasets",
    num_issues=10_000,
    rate_limit=5_000,
    issues_path=Path("."),
    headers=None,  # e.g. {"Authorization": f"token {GITHUB_TOKEN}"} to raise the rate limit
):
    """Download up to num_issues issues from a GitHub repo and save them as JSONL."""
    if not issues_path.is_dir():
        issues_path.mkdir(exist_ok=True)

    batch = []
    all_issues = []
    per_page = 100  # Number of issues to return per page
    num_pages = math.ceil(num_issues / per_page)
    base_url = "https://api.github.com/repos"

    # GitHub pagination is 1-based; starting at 0 would fetch the first page twice
    for page in tqdm(range(1, num_pages + 1)):
        # Query with state=all to get both open and closed issues
        query = f"issues?page={page}&per_page={per_page}&state=all"
        issues = requests.get(f"{base_url}/{owner}/{repo}/{query}", headers=headers)
        batch.extend(issues.json())

        if len(batch) > rate_limit and len(all_issues) < num_issues:
            all_issues.extend(batch)
            batch = []  # Flush batch for next time period
            print("Reached GitHub rate limit. Sleeping for one hour ...")
            time.sleep(60 * 60 + 1)

    all_issues.extend(batch)
    df = pd.DataFrame.from_records(all_issues)
    df.to_json(f"{issues_path}/{repo}-issues.jsonl", orient="records", lines=True)
    print(
        f"Downloaded all the issues for {repo}! Dataset stored at {issues_path}/{repo}-issues.jsonl"
    )
if __name__ == "__main__":
    # List every dataset currently hosted on the Hugging Face Hub
    all_datasets = list(list_datasets())  # materialize the iterator so it can be counted and sliced
    all_datasetids = [dataset.id for dataset in all_datasets]
    print(all_datasetids)
    print(f"There are {len(all_datasetids)} datasets currently available on the Hub")
    print(f"The first 10 are: {all_datasetids[:10]}")

    # Load a dataset and print the first example in the training set
    squad_dataset = load_dataset("squad")
    print(squad_dataset["train"][0])

    # dataset_id = "amazon_reviews_multi"
    # dataset_config = "all_languages"
    # dataset = load_dataset(dataset_id, dataset_config)
    # Load the emotion dataset and inspect its structure
    emotions = load_dataset("emotion")
    print(emotions)
    train_ds = emotions["train"]
    print(len(train_ds))
    print(train_ds[0])  # a single example
    print(train_ds.column_names)  # the column names
    print(train_ds.features)  # the column types
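    # Illustrative extra (not in the original script): a Dataset also supports
    # slicing and column access, which return plain Python dicts and lists.
    print(train_ds[:3])  # first three examples as a dict of columns
    print(train_ds["text"][:3])  # first three values of the "text" column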
    # Datasets is based on Apache Arrow, which defines a typed columnar format
    # that is more memory-efficient than native Python objects.
    emotions.set_format(type="pandas")
    df = emotions["train"][:]  # with the pandas format set, slicing returns a DataFrame
    print(df.head())
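    # Illustrative extra, a minimal sketch assuming the "label" column of the
    # emotion dataset loaded above: with the pandas format enabled, ordinary
    # pandas tooling works on the columns, e.g. counting examples per class.
    print(df["label"].value_counts())
    # Restore the default (dict-of-lists) output format for later code
    emotions.reset_format()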
    # Smoke-test the GitHub REST API by fetching a single issue
    url = "https://api.github.com/repos/huggingface/datasets/issues?page=1&per_page=1"
    response = requests.get(url)
    print(response.status_code)
    print(response.json())
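    # Illustrative extra (an assumption about the standard GitHub response
    # headers): the API advertises the request quota and what remains of it,
    # which is worth checking before a long fetch_issues() run.
    print(response.headers.get("X-RateLimit-Limit"))
    print(response.headers.get("X-RateLimit-Remaining"))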
    # Download the full issues corpus and reload it as a Dataset
    fetch_issues()
    issues_dataset = load_dataset("json", data_files="datasets-issues.jsonl", split="train")
    print(issues_dataset)
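    # Illustrative sketch (not in the original script), assuming the standard
    # GitHub issues payload survives the JSONL round-trip: the /issues endpoint
    # also returns pull requests, which carry a non-null "pull_request" field,
    # so rows where that field is None are genuine issues.
    only_issues = issues_dataset.filter(lambda x: x.get("pull_request") is None)
    print(only_issues)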