Skip to content

Commit 9704f6f

Browse files
author
ProgrammingIncluded
committed
v0.2.0: Enable Login and New Scroll Method
1 parent 3967913 commit 9704f6f

File tree

2 files changed

+28
-52
lines changed

2 files changed

+28
-52
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
# CHANGELOG
22

3+
## 0.2.0: Enable Login and New Scroll Method
4+
5+
* Add `--login` option to let users log in
6+
* New algorithm for scrolling down content to prevent duplications
7+
38
## 0.1.1: Better Folder Renames
49

510
* Add some code-cleanup

birdwatch.py

Lines changed: 23 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -100,9 +100,9 @@ def fetch_html(driver, url, fpath, load_times, force=False, number_posts_to_cap=
100100
metadata["name"], metadata["username"] = ensures_or(lambda: driver.find_element(By.CSS_SELECTOR,'div[data-testid="UserName"]').text.split('\n'), ("NULL", "NULL"))
101101
metadata["location"] = ensures_or(lambda: driver.find_element(By.CSS_SELECTOR,'span[data-testid="UserLocation"]').text)
102102
metadata["website"] = ensures_or(lambda: driver.find_element(By.CSS_SELECTOR,'a[data-testid="UserUrl"]').text)
103-
metadata["join_date"] = ensures_or(driver.find_element(By.CSS_SELECTOR,'span[data-testid="UserJoinDate"]').text)
104-
metadata["following"] = ensures_or(driver.find_element(By.XPATH, "//span[contains(text(), 'Following')]/ancestor::a/span").text)
105-
metadata["followers"] = ensures_or(driver.find_element(By.XPATH, "//span[contains(text(), 'Followers')]/ancestor::a/span").text)
103+
metadata["join_date"] = ensures_or(lambda: driver.find_element(By.CSS_SELECTOR,'span[data-testid="UserJoinDate"]').text)
104+
metadata["following"] = ensures_or(lambda: driver.find_element(By.XPATH, "//span[contains(text(), 'Following')]/ancestor::a/span").text)
105+
metadata["followers"] = ensures_or(lambda: driver.find_element(By.XPATH, "//span[contains(text(), 'Followers')]/ancestor::a/span").text)
106106

107107
if metadata.get("username", "NULL") == "NULL":
108108
raise RuntimeError("Fatal error, unable to resolve username {}".format(metadata))
@@ -115,7 +115,7 @@ def fetch_html(driver, url, fpath, load_times, force=False, number_posts_to_cap=
115115
if not force and os.path.exists(fpath):
116116
print("Folder already exists, skipping: {}".format(fpath))
117117
return
118-
elif force:
118+
elif force and os.path.exists(fpath):
119119
shutil.rmtree(fpath)
120120

121121
os.makedirs(fpath)
@@ -141,12 +141,12 @@ def fetch_html(driver, url, fpath, load_times, force=False, number_posts_to_cap=
141141
last_id_count = 0
142142
tweets_tracker = set()
143143
boosted_tracker = set()
144-
div_id_track = set()
145144
estimated_height = 0
145+
div_track = set()
146146
try:
147147
last_height = 0
148148
new_height = 0
149-
temp_load_times = load_times
149+
time.sleep(random.uniform(load_times, load_times + 2))
150150
while True:
151151
if id_tracker >= number_posts_to_cap - 1:
152152
break
@@ -162,50 +162,23 @@ def fetch_html(driver, url, fpath, load_times, force=False, number_posts_to_cap=
162162

163163
tweets = driver.find_elements(By.CSS_SELECTOR, '[data-testid="tweet"]')
164164
for tweet in tweets:
165-
# Enables backwards scrolling and looking at existing divs.
166-
div_id = tweet.get_attribute("aria-labelledby")
167-
if div_id and div_id in div_id_track:
168-
print("Div track is working??")
169-
continue
170-
print("DIV: {}".format(div_id))
171-
div_id_track.add(div_id)
172-
173165
# Try to scroll there first and retry 2x load times before giving up.
174166
# Then bump up global load times by one.
175-
scrolled = False
176-
limit = temp_load_times + 3
177-
for lt in range(temp_load_times, limit):
178-
try:
179-
driver.execute_script("return arguments[0].scrollIntoView();", tweet)
180-
driver.execute_script("window.scrollTo(0, window.pageYOffset - 50);")
181-
time.sleep(1)
182-
scrolled = True
183-
# Reset load times
184-
temp_load_times = load_times
185-
break
186-
except selenium.common.exceptions.StaleElementReferenceException as e:
187-
if lt < limit - 1:
188-
print("Loading times are getting harder. Bumping wait time for next iteration.")
189-
print_debug("Load time: {}".format(lt))
190-
try:
191-
WebDriverWait(driver, 20).until(EC.presence_of_element_located(tweet))
192-
driver.execute_script("window.scrollTo(0, window.pageYOffset - 20);")
193-
except:
194-
pass
195-
elif temp_load_times >= load_times + 2:
196-
print("Even after bumping global load times, still failing. Aborting task.")
197-
raise e
198-
else:
199-
print("This tweet just ain't load'in. Bumping global load times to: {}".format(temp_load_times))
200-
# Scroll backwards.
201-
driver.execute_script("window.scrollTo(0, window.pageYOffset - 10);")
202-
time.sleep(lt)
203-
temp_load_times += 1
204-
break
205-
206-
if not scrolled:
207-
print("SKIPPING")
208-
break
167+
try:
168+
div_id = tweet.get_attribute("aria-labelledby")
169+
if div_id in div_track:
170+
continue
171+
172+
div_track.add(div_id)
173+
driver.execute_script("return arguments[0].scrollIntoView();", tweet)
174+
driver.execute_script("window.scrollTo(0, window.pageYOffset - 50);")
175+
except:
176+
continue
177+
178+
height = float(driver.execute_script("return window.scrollTop || window.pageYOffset;"))
179+
if height < estimated_height:
180+
continue
181+
estimated_height = height
209182

210183
tm = {"id": id_tracker}
211184
tm["tag_text"] = ensures_or(lambda: tweet.find_element(By.CSS_SELECTOR,'div[data-testid="User-Names"]').text)
@@ -237,8 +210,6 @@ def fetch_html(driver, url, fpath, load_times, force=False, number_posts_to_cap=
237210
print("ARLEAD {}".format(dtm))
238211
continue
239212

240-
estimated_height += tweet.size["height"]
241-
242213
try:
243214
# Try to remove elements before screenshot
244215
remove_elements(driver, ["sheetDialog", "confirmationSheetDialog", "mask"])
@@ -255,16 +226,16 @@ def fetch_html(driver, url, fpath, load_times, force=False, number_posts_to_cap=
255226
break
256227

257228
# Scroll!
258-
driver.execute_script("window.scrollTo(0, {});".format(estimated_height + 100))
259229
time.sleep(random.uniform(load_times, load_times + 2))
230+
driver.execute_script("window.scrollTo(0, {});".format(estimated_height + 10))
260231
new_height = driver.execute_script("return document.body.scrollHeight")
261232
if new_height == last_height:
262233
break
263234
last_height = new_height
235+
264236
except selenium.common.exceptions.StaleElementReferenceException as e:
265237
print("Tweet limit reached, for {} unable to fetch more data. Authentication is required.".format(username))
266238
print("Or you can try to bump loading times.")
267-
input()
268239
raise e
269240
except Exception as e:
270241
raise e

0 commit comments

Comments
 (0)