forked from dixudx/tumblr-crawler
tumblr-photo-video-ripper.py
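"""Download original-size photos from one or more Tumblr blogs.

Blog names are taken from the command line (comma-separated) or from a
sites.txt file next to this script. Posts are fetched through the Tumblr
API via pytumblr, and the photo files are saved into one folder per blog
by a pool of download worker threads.
"""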
import os
import sys
import time
import requests
import pytumblr
from queue import Queue
from threading import Thread

# Constants
TIMEOUT = 10    # per-request timeout in seconds
MEDIA_NUM = 20  # posts per API request (the Tumblr v2 API caps `limit` at 20)
THREADS = 10    # number of concurrent download workers
DELAY = 1       # delay in seconds between API requests

# Tumblr API credentials (register an app at https://www.tumblr.com/oauth/apps)
CONSUMER_KEY = 'YOUR KEY HERE'
CONSUMER_SECRET = 'YOUR SECRET HERE'
OAUTH_TOKEN = 'YOUR OAUTH TOKEN HERE'
OAUTH_SECRET = 'YOUR OAUTH SECRET HERE'

# Initialize the Tumblr client
client = pytumblr.TumblrRestClient(
    CONSUMER_KEY,
    CONSUMER_SECRET,
    OAUTH_TOKEN,
    OAUTH_SECRET
)


class DownloadWorker(Thread):
    """Worker thread that pulls (photo_url, target_folder) pairs off the queue."""

    def __init__(self, queue):
        super().__init__()
        self.queue = queue

    def run(self):
        while True:
            photo_url, target_folder = self.queue.get()
            self.download(photo_url, target_folder)
            self.queue.task_done()

    def download(self, url, folder):
        try:
            response = requests.get(url, timeout=TIMEOUT)
            if response.status_code == 200:
                filename = os.path.join(folder, os.path.basename(url))
                with open(filename, 'wb') as f:
                    f.write(response.content)
                print(f"Downloaded: {filename}")
            else:
                print(f"Failed to download {url} (HTTP {response.status_code})")
        except Exception as e:
            print(f"Error downloading {url}: {e}")


class CrawlerScheduler:
    def __init__(self, sites):
        self.sites = sites
        self.queue = Queue()
        self.scheduling()

    def scheduling(self):
        # Start the worker pool, then queue every photo URL for each site.
        for _ in range(THREADS):
            worker = DownloadWorker(self.queue)
            worker.daemon = True
            worker.start()
        for site in self.sites:
            self.download_media(site)
        # Block until the workers have drained the queue; without this the
        # daemon threads are killed as soon as the main thread exits.
        self.queue.join()

    def download_media(self, site):
        current_folder = os.getcwd()
        target_folder = os.path.join(current_folder, site.strip())
        os.makedirs(target_folder, exist_ok=True)

        start = 0
        total_posts = 0
        while True:
            try:
                posts_data = client.posts(site, type='photo', offset=start, limit=MEDIA_NUM)
                posts = posts_data.get('posts', [])
                if not posts:
                    print(f"No more posts found for {site}")
                    break
                num_posts = len(posts)
                print(f"Retrieved {num_posts} posts from {site} starting at offset {start}")
                for post in posts:
                    photo_urls = [photo['original_size']['url'] for photo in post.get('photos', [])]
                    for url in photo_urls:
                        self.queue.put((url, target_folder))
                total_posts += num_posts
                start += num_posts
                if num_posts < MEDIA_NUM:
                    print(f"Finished fetching posts from {site}")
                    break
                time.sleep(DELAY)
            except Exception as e:
                print(f"Error processing posts from {site}: {e}")
                time.sleep(DELAY)  # Retry after delay
                continue
        print(f"Total posts queued from {site}: {total_posts}")


def parse_sites(filename):
    with open(filename, "r") as f:
        return [extract_blog_name(line) for line in f if line.strip()]


def extract_blog_name(url):
    # Accept a full https://www.tumblr.com/<blog> URL or a bare blog name.
    url = url.strip()
    if "https://www.tumblr.com/" in url:
        url = url.split("https://www.tumblr.com/")[1]
    return url.strip("/ ")


if __name__ == "__main__":
    cur_dir = os.path.dirname(os.path.realpath(__file__))
    if len(sys.argv) < 2:
        filename = os.path.join(cur_dir, "sites.txt")
        if os.path.exists(filename):
            sites = parse_sites(filename)
        else:
            print("Please specify sites or create sites.txt.")
            sys.exit(1)
    else:
        sites = [extract_blog_name(url) for url in sys.argv[1].split(",") if url.strip()]

    if not sites:
        print("No valid sites specified.")
        sys.exit(1)

    CrawlerScheduler(sites)
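

# Usage sketch (blog names below are placeholders):
#
#   python tumblr-photo-video-ripper.py
#       Reads blogs from sites.txt next to this script, one per line, either as
#       a bare blog name or as a https://www.tumblr.com/<blog> URL.
#
#   python tumblr-photo-video-ripper.py https://www.tumblr.com/blog-one,blog-two
#       Takes comma-separated blogs directly on the command line.
#
# Photos are saved under ./<blog-name>/ relative to the current working directory.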