Skip to content

Commit 0df572e

Browse files
author
ProgrammingIncluded
committed
Use running average calculations for offset
1 parent 9704f6f commit 0df572e

File tree

1 file changed

+13
-4
lines changed

1 file changed

+13
-4
lines changed

birdwatch.py

Lines changed: 13 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,6 @@
2525
from selenium.webdriver.chrome.service import Service
2626
from webdriver_manager.chrome import ChromeDriverManager
2727
from selenium.webdriver.support.ui import WebDriverWait
28-
from selenium.common.exceptions import WebDriverException
2928

3029
SCRAPE_N_TWEETS = 20
3130
IS_DEBUG = False
@@ -78,6 +77,14 @@ def remove_elements(driver, elements, remove_parent=True):
7877
}}
7978
""".format(",".join(elements)))
8079

80+
def calc_average(lst):
81+
if len(lst) < 4:
82+
return sum(lst) / len(lst)
83+
84+
cut_off = int(len(lst) * 0.25)
85+
s = sorted(lst)[cut_off:len(lst) - cut_off]
86+
return sum(s) / len(s)
87+
8188
def fetch_html(driver, url, fpath, load_times, force=False, number_posts_to_cap=SCRAPE_N_TWEETS, bio_only=False):
8289
driver.get(url)
8390
state = ""
@@ -142,6 +149,7 @@ def fetch_html(driver, url, fpath, load_times, force=False, number_posts_to_cap=
142149
tweets_tracker = set()
143150
boosted_tracker = set()
144151
estimated_height = 0
152+
height_diffs = []
145153
div_track = set()
146154
try:
147155
last_height = 0
@@ -178,6 +186,7 @@ def fetch_html(driver, url, fpath, load_times, force=False, number_posts_to_cap=
178186
height = float(driver.execute_script("return window.scrollTop || window.pageYOffset;"))
179187
if height < estimated_height:
180188
continue
189+
height_diffs.append(height - estimated_height)
181190
estimated_height = height
182191

183192
tm = {"id": id_tracker}
@@ -226,8 +235,8 @@ def fetch_html(driver, url, fpath, load_times, force=False, number_posts_to_cap=
226235
break
227236

228237
# Scroll!
238+
driver.execute_script("window.scrollTo(0, {});".format(estimated_height + calc_average(height_diffs)))
229239
time.sleep(random.uniform(load_times, load_times + 2))
230-
driver.execute_script("window.scrollTo(0, {});".format(estimated_height + 10))
231240
new_height = driver.execute_script("return document.body.scrollHeight")
232241
if new_height == last_height:
233242
break
@@ -247,7 +256,7 @@ def fetch_html(driver, url, fpath, load_times, force=False, number_posts_to_cap=
247256
def parse_args():
248257
parser = argparse.ArgumentParser(description="Process Twitter Account Metadata")
249258
parser.add_argument("--force", "-f", help="Force re-download everything. WARNING, will delete outputs.", action="store_true")
250-
parser.add_argument("--posts", "-p", help="Max number of posts to screenshot.", default=SCRAPE_N_TWEETS)
259+
parser.add_argument("--posts", "-p", help="Max number of posts to screenshot.", default=SCRAPE_N_TWEETS, type=int)
251260
parser.add_argument("--bio-only", "-b", help="Only store bio, no snapshots or tweets.", action="store_true")
252261
parser.add_argument("--debug", help="Print debug output.", action="store_true")
253262
parser.add_argument("--login", help="Prompt user login to remove tweet limit..", action="store_true")
@@ -265,7 +274,7 @@ def main():
265274

266275
output_folder = "snapshots"
267276
os.makedirs(output_folder, exist_ok=True)
268-
extra_args = {"force": args.force, "bio_only": args.bio_only, "load_times": args.scroll_load_time}
277+
extra_args = {"force": args.force, "bio_only": args.bio_only, "load_times": args.scroll_load_time, "number_posts_to_cap": args.posts}
269278

270279
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
271280
if args.login:

0 commit comments

Comments
 (0)