gxd.py
#!/usr/bin/python
#encoding:utf-8
# Usage:
#
# In a terminal/command line, cd to the directory where this file lives. Then...
#
# With the default URL list: (download the URLs read from ./gxd.csv below)
#
# python ./gxd.py
#
# Download all files in a Metalink/CSV: (downloaded from ASF Vertex)
#
# python ./gxd.py /path/to/downloads.metalink localmetalink.metalink localcsv.csv
#
# Compatibility: python >= 2.6.5, 2.7.5, 3.0 (this variant additionally requires pandas)
#
# If downloading from a trusted source with invalid SSL Certs, use --insecure to ignore
#
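# For example (an illustrative command; substitute your own metalink path):
#
# python ./gxd.py --insecure /path/to/downloads.metalink
#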
# For more information on bulk downloads, navigate to:
# https://asf.alaska.edu/how-to/data-tools/data-tools/#bulk_download
#
#
#
# This script was generated by the Alaska Satellite Facility's bulk download service.
# For more information on the service, navigate to:
# http://bulk-download.asf.alaska.edu/help
#
import sys
import csv
import os
import os.path
import tempfile
import shutil
import re
import base64
import time
import getpass
import ssl
import signal
import socket
import xml.etree.ElementTree as ET
#############
# This next block is a bunch of Python 2/3 compatibility imports
try:
# Python 2.x Libs
from urllib2 import build_opener, install_opener, Request, urlopen, HTTPError
from urllib2 import URLError, HTTPSHandler, HTTPHandler, HTTPCookieProcessor
from cookielib import MozillaCookieJar
from StringIO import StringIO
except ImportError as e:
# Python 3.x Libs
from urllib.request import build_opener, install_opener, Request, urlopen
from urllib.request import HTTPHandler, HTTPSHandler, HTTPCookieProcessor
from urllib.error import HTTPError, URLError
from http.cookiejar import MozillaCookieJar
from io import StringIO
import pandas as pd
read = pd.read_csv('./gxd.csv')
mydict = []
for a in read['url']:
mydict.append(a)
# sList = str(mydict[::])
# print(sList)
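# A minimal sketch of the expected ./gxd.csv layout (hypothetical URLs shown;
# the only column this script reads is 'url'):
#
# url
# https://example.com/granule_one.zip
# https://example.com/granule_two.zip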
###
# Global variables intended for cross-thread modification
abort = False
# List recording files that failed to download
dataSet = []
###
# A routine that handles trapped signals
def signal_handler(sig, frame):
global abort
sys.stderr.write("\n > Caught Signal. Exiting!\n")
abort = True # necessary to cause the program to stop
raise SystemExit # this will only abort the thread that the ctrl+c was caught in
class bulk_downloader:
def __init__(self):
# List of files to download
self.files = mydict
# Local stash of cookies so we don't always have to ask
self.cookie_jar_path = os.path.join(
os.path.expanduser('~'), ".bulkcookiejar.txt")
self.cookie_jar = None
self.asf_urs4 = {'url': 'https://urs.earthdata.nasa.gov/oauth/authorize',
'client': 'BO_n7nTIlMljdvU6kRRB3g',
'redir': 'https://auth.asf.alaska.edu/login'}
# Make sure we can write to our current directory
if os.access(os.getcwd(), os.W_OK) is False:
print("WARNING: Cannot write to current path! Check permissions for {0}".format(
os.getcwd()))
exit(-1)
# For SSL
self.context = {}
# Check if user handed in a Metalink or CSV:
if len(sys.argv) > 0:
download_files = []
input_files = []
for arg in sys.argv[1:]:
if arg == '--insecure':
try:
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
self.context['context'] = ctx
except AttributeError:
# Python 2.6 won't complain about SSL Validation
pass
elif arg.endswith('.metalink') or arg.endswith('.csv'):
if os.path.isfile(arg):
input_files.append(arg)
if arg.endswith('.metalink'):
new_files = self.process_metalink(arg)
else:
new_files = self.process_csv(arg)
if new_files is not None:
for file_url in (new_files):
download_files.append(file_url)
else:
print(
" > I cannot find the input file you specified: {0}".format(arg))
else:
print(
" > Command line argument '{0}' makes no sense, ignoring.".format(arg))
if len(input_files) > 0:
if len(download_files) > 0:
print(" > Processing {0} downloads from {1} input files. ".format(
len(download_files), len(input_files)))
self.files = download_files
else:
print(" > I see you asked me to download files from {0} input files, but they had no downloads!".format(
len(input_files)))
print(" > I'm super confused and exiting.")
exit(-1)
# Make sure cookie_jar is good to go!
self.get_cookie()
# summary
self.total_bytes = 0
self.total_time = 0
self.cnt = 0
self.success = []
self.failed = []
self.skipped = []
def getFailedList(self, fileName):
# fileName: URL of the file that failed to download
if [fileName] not in dataSet:
print('Not yet in the failure list, appending it')
with open('failList.csv', 'a', encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow([fileName])
else:
print('Already in the failure list, not appending')
def getfails(self):
with open('failList.csv', 'r', encoding="utf-8") as f:
csv_reader = csv.reader(f)
rows = [row for row in csv_reader]
return rows
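# Note: failList.csv accumulates one failed URL per row; getfails() reloads
# those rows so getFailedList() can avoid appending duplicates.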
# Get and validate a cookie
def get_cookie(self):
if os.path.isfile(self.cookie_jar_path):
self.cookie_jar = MozillaCookieJar()
self.cookie_jar.load(self.cookie_jar_path)
# make sure cookie is still valid
if self.check_cookie():
print(" > Re-using previous cookie jar.")
return True
else:
print(" > Could not validate old cookie Jar")
# We don't have a valid cookie, prompt user for creds
print("No existing URS cookie found, please enter Earthdata username & password:")
print("(Credentials will not be stored, saved or logged anywhere)")
# Keep trying 'till user gets the right U:P
while self.check_cookie() is False:
self.get_new_cookie()
return True
# Validate cookie before we begin
def check_cookie(self):
if self.cookie_jar is None:
print(" > Cookiejar is bunk: {0}".format(self.cookie_jar))
return False
# File we know is valid, used to validate cookie
file_check = 'https://urs.earthdata.nasa.gov/profile'
# Apply custom Redirect Handler
opener = build_opener(HTTPCookieProcessor(
self.cookie_jar), HTTPHandler(), HTTPSHandler(**self.context))
install_opener(opener)
# Attempt a HEAD request
request = Request(file_check)
request.get_method = lambda: 'HEAD'
try:
print(" > attempting to download {0}".format(file_check))
response = urlopen(request, timeout=30)
resp_code = response.getcode()
# Make sure we're logged in
if not self.check_cookie_is_logged_in(self.cookie_jar):
return False
# Save cookiejar
self.cookie_jar.save(self.cookie_jar_path)
except HTTPError:
# If we get this error, it likely means the user has not agreed to the current EULA
print("\nIMPORTANT: ")
print(
"Your user appears to lack permissions to download data from the ASF Datapool.")
print("\n\nNew users: you must first log into Vertex and accept the EULA. In addition, your Study Area must be set at Earthdata https://urs.earthdata.nasa.gov")
exit(-1)
# These return codes indicate the USER has not been approved to download the data
if resp_code in (300, 301, 302, 303):
try:
redir_url = response.info().getheader('Location')
except AttributeError:
redir_url = response.getheader('Location')
# Funky Test env:
if ("vertex-retired.daac.asf.alaska.edu" in redir_url and "test" in self.asf_urs4['redir']):
print("Cough, cough. It's dusty in this test env!")
return True
print(
"Redirect ({0}) occurred, invalid cookie value!".format(resp_code))
return False
# These are successes!
if resp_code in (200, 307):
return True
return False
def get_new_cookie(self):
# Start by prompting user to input their credentials
# Another Python2/3 workaround
try:
new_username = raw_input("Username: ")
except NameError:
new_username = input("Username: ")
new_password = getpass.getpass(
prompt="Password (will not be displayed): ")
# Build URS4 Cookie request
auth_cookie_url = self.asf_urs4['url'] + '?client_id=' + self.asf_urs4['client'] + \
'&redirect_uri=' + \
self.asf_urs4['redir'] + '&response_type=code&state='
try:
# python2
user_pass = base64.b64encode(bytes(new_username+":"+new_password))
except TypeError:
# python3
user_pass = base64.b64encode(
bytes(new_username+":"+new_password, "utf-8"))
user_pass = user_pass.decode("utf-8")
# Authenticate against URS, grab all the cookies
self.cookie_jar = MozillaCookieJar()
opener = build_opener(HTTPCookieProcessor(
self.cookie_jar), HTTPHandler(), HTTPSHandler(**self.context))
request = Request(auth_cookie_url, headers={
"Authorization": "Basic {0}".format(user_pass)})
# Watch out for cookie rejection!
try:
response = opener.open(request)
except HTTPError as e:
if "WWW-Authenticate" in e.headers and "Please enter your Earthdata Login credentials" in e.headers["WWW-Authenticate"]:
print(
" > Username and Password combo was not successful. Please try again.")
return False
else:
# If an error happens here, the user most likely has not confirmed EULA.
print("\nIMPORTANT: There was an error obtaining a download cookie!")
print(
"Your user appears to lack permission to download data from the ASF Datapool.")
print("\n\nNew users: you must first log into Vertex and accept the EULA. In addition, your Study Area must be set at Earthdata https://urs.earthdata.nasa.gov")
exit(-1)
except URLError as e:
print(
"\nIMPORTANT: There was a problem communicating with URS, unable to obtain cookie. ")
print("Try cookie generation later.")
exit(-1)
# Did we get a cookie?
if self.check_cookie_is_logged_in(self.cookie_jar):
# COOKIE SUCCESS!
self.cookie_jar.save(self.cookie_jar_path)
return True
# if we aren't successful generating the cookie, nothing will work. Stop here!
print("WARNING: Could not generate new cookie! Cannot proceed. Please try Username and Password again.")
print("Response was {0}.".format(response.getcode()))
print("\n\nNew users: you must first log into Vertex and accept the EULA. In addition, your Study Area must be set at Earthdata https://urs.earthdata.nasa.gov")
exit(-1)
# make sure we're logged into URS
def check_cookie_is_logged_in(self, cj):
for cookie in cj:
if cookie.name == 'urs_user_already_logged':
# Only get this cookie if we logged in successfully!
# get_new_cookie()
return True
return False
# Download the file
def download_file_with_cookiejar(self, url, file_count, total, recursion=False):
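# Return convention (as interpreted by download_files below): (None, None) means
# the file already exists and was skipped, (False, None) means the download failed,
# otherwise (actual_size, expected_size) is returned on success.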
# see if we've already downloaded this file and, if so, whether it is the correct size
download_file = os.path.basename(url).split('?')[0]
if os.path.isfile(download_file):
try:
request = Request(url)
request.get_method = lambda: 'HEAD'
response = urlopen(request, timeout=30)
remote_size = self.get_total_size(response)
# Check that we were able to derive a size.
if remote_size:
local_size = os.path.getsize(download_file)
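# treat the local copy as complete if it is within ~1% of the remote size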
if remote_size < (local_size+(local_size*.01)) and remote_size > (local_size-(local_size*.01)):
print(" > Download file {0} exists! \n > Skipping download of {1}. ".format(
download_file, url))
return None, None
# partial file size wasn't the full file size; let's blow away the chunk and start again
print(" > Found {0} but it wasn't fully downloaded. Removing file and downloading again.".format(
download_file))
os.remove(download_file)
except ssl.CertificateError as e:
print(" > ERROR: {0}".format(e))
print(
" > Could not validate SSL Cert. You may be able to overcome this using the --insecure flag")
return False, None
except HTTPError as e:
if e.code == 401:
print(
" > IMPORTANT: Your user may not have permission to download this type of data!")
else:
print(
" > Unknown Error, Could not get file HEAD: {0}".format(e))
except URLError as e:
print("URL Error (from HEAD): {0}, {1}".format(e.reason, url))
if "ssl.c" in "{0}".format(e.reason):
print(
"IMPORTANT: Remote location may not be accepting your SSL configuration. This is a terminal error.")
return False, None
# attempt https connection
try:
request = Request(url)
response = urlopen(request, timeout=30)
# Watch for redirect
if response.geturl() != url:
# See if we were redirected BACK to URS for re-auth.
if 'https://urs.earthdata.nasa.gov/oauth/authorize' in response.geturl():
if recursion:
print(" > Entering seemingly endless auth loop. Aborting. ")
return False, None
# make this easier. If there is no app_type=401, add it
new_auth_url = response.geturl()
if "app_type" not in new_auth_url:
new_auth_url += "&app_type=401"
print(
" > While attempting to download {0}....".format(url))
print(" > Need to obtain new cookie from {0}".format(
new_auth_url))
old_cookies = [cookie.name for cookie in self.cookie_jar]
opener = build_opener(HTTPCookieProcessor(
self.cookie_jar), HTTPHandler(), HTTPSHandler(**self.context))
request = Request(new_auth_url)
try:
response = opener.open(request)
for cookie in self.cookie_jar:
if cookie.name not in old_cookies:
print(" > Saved new cookie: {0}".format(
cookie.name))
# A little hack to save session cookies
if cookie.discard:
cookie.expires = int(
time.time()) + 60*60*24*30
print(
" > Saving session Cookie that should have been discarded! ")
self.cookie_jar.save(
self.cookie_jar_path, ignore_discard=True, ignore_expires=True)
except HTTPError as e:
print("HTTP Error: {0}, {1}".format(e.code, url))
return False, None
# Okay, now we have more cookies! Lets try again, recursively!
print(" > Attempting download again with new cookies!")
return self.download_file_with_cookiejar(url, file_count, total, recursion=True)
print(
" > 'Temporary' Redirect download @ Remote archive:\n > {0}".format(response.geturl()))
# seems to be working
print("({0}/{1}) Downloading {2}".format(file_count, total, url))
# Open our local file for writing and build status bar
tf = tempfile.NamedTemporaryFile(mode='w+b', delete=False, dir='.')
self.chunk_read(response, tf, report_hook=self.chunk_report)
# Reset download status
sys.stdout.write('\n')
tempfile_name = tf.name
tf.close()
# handle errors
except HTTPError as e:
print("HTTP Error: {0}, {1}".format(e.code, url))
if e.code == 401:
print(
" > IMPORTANT: Your user does not have permission to download this type of data!")
if e.code == 403:
print(" > Got a 403 Error trying to download this file. ")
print(" > You MAY need to log in this app and agree to a EULA. ")
return False, None
except URLError as e:
print("URL Error (from GET): {0}, {1}, {2}".format(
e, e.reason, url))
if "ssl.c" in "{0}".format(e.reason):
print(
"IMPORTANT: Remote location may not be accepting your SSL configuration. This is a terminal error.")
return False, None
except socket.timeout as e:
print(" > timeout requesting: {0}; {1}".format(url, e))
return False, None
except ssl.CertificateError as e:
print(" > ERROR: {0}".format(e))
print(
" > Could not validate SSL Cert. You may be able to overcome this using the --insecure flag")
return False, None
# Return the file size
shutil.copy(tempfile_name, download_file)
os.remove(tempfile_name)
file_size = self.get_total_size(response)
actual_size = os.path.getsize(download_file)
if file_size is None:
# We were unable to calculate file size.
file_size = actual_size
return actual_size, file_size
def get_redirect_url_from_error(self, error):
find_redirect = re.compile(r"id=\"redir_link\"\s+href=\"(\S+)\"")
print("error file was: {}".format(error))
redirect_url = find_redirect.search(error)
if redirect_url:
print("Found: {0}".format(redirect_url.group(0)))
return (redirect_url.group(0))
return None
# chunk_report taken from http://stackoverflow.com/questions/2028517/python-urllib2-progress-hook
def chunk_report(self, bytes_so_far, file_size):
if file_size is not None:
percent = float(bytes_so_far) / file_size
percent = round(percent*100, 2)
sys.stdout.write(" > Downloaded %d of %d bytes (%0.2f%%)\r" %
(bytes_so_far, file_size, percent))
else:
# We couldn't figure out the size.
sys.stdout.write(
" > Downloaded %d of unknown Size\r" % (bytes_so_far))
# chunk_read modified from http://stackoverflow.com/questions/2028517/python-urllib2-progress-hook
def chunk_read(self, response, local_file, chunk_size=8192, report_hook=None):
file_size = self.get_total_size(response)
bytes_so_far = 0
while 1:
try:
chunk = response.read(chunk_size)
except:
sys.stdout.write("\n > There was an error reading data. \n")
break
try:
local_file.write(chunk)
except TypeError:
local_file.write(chunk.decode(local_file.encoding))
bytes_so_far += len(chunk)
if not chunk:
break
if report_hook:
report_hook(bytes_so_far, file_size)
return bytes_so_far
def get_total_size(self, response):
try:
file_size = response.info().getheader('Content-Length').strip()
except AttributeError:
try:
file_size = response.getheader('Content-Length').strip()
except AttributeError:
print("> Problem getting size")
return None
return int(file_size)
# Get download urls from a metalink file
def process_metalink(self, ml_file):
print("Processing metalink file: {0}".format(ml_file))
with open(ml_file, 'r') as ml:
xml = ml.read()
# Hack to remove annoying namespace
it = ET.iterparse(StringIO(xml))
for _, el in it:
if '}' in el.tag:
el.tag = el.tag.split('}', 1)[1] # strip all namespaces
root = it.root
dl_urls = []
ml_files = root.find('files')
for dl in ml_files:
dl_urls.append(dl.find('resources').find('url').text)
if len(dl_urls) > 0:
return dl_urls
else:
return None
# Get download urls from a csv file
def process_csv(self, csv_file):
print("Processing csv file: {0}".format(csv_file))
dl_urls = []
with open(csv_file, 'r') as csvf:
try:
csvr = csv.DictReader(csvf)
for row in csvr:
dl_urls.append(row['URL'])
except csv.Error as e:
print("WARNING: Could not parse file %s, line %d: %s. Skipping." % (
csv_file, csvr.line_num, e))
return None
except KeyError as e:
print(
"WARNING: Could not find URL column in file %s. Skipping." % (csv_file))
if len(dl_urls) > 0:
return dl_urls
else:
return None
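# Note: Vertex CSVs are read via their upper-case 'URL' column, unlike the
# local gxd.csv above, whose column is lower-case 'url'.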
# Download all the files in the list
def download_files(self):
for file_name in self.files:
# make sure we haven't ctrl+c'd or some other abort trap
if abort:
raise SystemExit
# download counter
self.cnt += 1
# set a timer
start = time.time()
# run download
size, total_size = self.download_file_with_cookiejar(
file_name, self.cnt, len(self.files))
# calculate rate
end = time.time()
# stats:
if size is None:
self.skipped.append(file_name)
# Check to see that the download didn't error and is the correct size
elif size is not False and (total_size < (size+(size*.01)) and total_size > (size-(size*.01))):
# Download was good!
elapsed = end - start
elapsed = 1.0 if elapsed < 1 else elapsed
rate = (size/1024**2)/elapsed
print(
"Downloaded {0}b in {1:.2f}secs, Average Rate: {2:.2f}MB/sec".format(size, elapsed, rate))
# add up metrics
self.total_bytes += size
self.total_time += elapsed
self.success.append({'file': file_name, 'size': size})
else:
print("There was a problem downloading {0}".format(file_name))
self.failed.append(file_name)
try:
# refresh the module-level list of previously recorded failures so duplicates are skipped
global dataSet
dataSet = self.getfails()
self.getFailedList(file_name)
except IOError:
self.getFailedList(file_name)
def print_summary(self):
# Print summary:
print("\n\nDownload Summary ")
print("--------------------------------------------------------------------------------")
print(" Successes: {0} files, {1} bytes ".format(
len(self.success), self.total_bytes))
for success_file in self.success:
print(" - {0} {1:.2f}MB".format(
success_file['file'], (success_file['size']/1024.0**2)))
if len(self.failed) > 0:
print(" Failures: {0} files".format(len(self.failed)))
for failed_file in self.failed:
print(" - {0}".format(failed_file))
if len(self.skipped) > 0:
print(" Skipped: {0} files".format(len(self.skipped)))
for skipped_file in self.skipped:
print(" - {0}".format(skipped_file))
if len(self.success) > 0:
print(
" Average Rate: {0:.2f}MB/sec".format((self.total_bytes/1024.0**2)/self.total_time))
print("--------------------------------------------------------------------------------")
if __name__ == "__main__":
# Setup a signal trap for SIGINT (Ctrl+C)
signal.signal(signal.SIGINT, signal_handler)
downloader = bulk_downloader()
downloader.download_files()
downloader.print_summary()