25
25
from selenium .webdriver .chrome .service import Service
26
26
from webdriver_manager .chrome import ChromeDriverManager
27
27
from selenium .webdriver .support .ui import WebDriverWait
28
- from selenium .common .exceptions import WebDriverException
29
28
30
29
SCRAPE_N_TWEETS = 20
31
30
IS_DEBUG = False
@@ -78,6 +77,14 @@ def remove_elements(driver, elements, remove_parent=True):
78
77
}}
79
78
""" .format ("," .join (elements )))
80
79
80
def calc_average(lst):
    """Return the mean of *lst* with the top and bottom quartiles trimmed.

    Trimming discards outliers (e.g. one unusually large scroll jump)
    so the caller gets a stable scroll-distance estimate.  Lists shorter
    than 4 elements cannot be trimmed, so the plain mean is returned.

    Args:
        lst: sequence of numbers (scroll-height deltas in the caller).

    Returns:
        float: trimmed mean of the values, or 0 for an empty input.
    """
    # Guard: the caller accumulates deltas lazily, so the very first
    # invocation can see an empty list — previously ZeroDivisionError.
    if not lst:
        return 0
    if len(lst) < 4:
        return sum(lst) / len(lst)

    # Drop the lowest and highest 25% of samples before averaging.
    cut_off = int(len(lst) * 0.25)
    trimmed = sorted(lst)[cut_off:len(lst) - cut_off]
    return sum(trimmed) / len(trimmed)
81
88
def fetch_html (driver , url , fpath , load_times , force = False , number_posts_to_cap = SCRAPE_N_TWEETS , bio_only = False ):
82
89
driver .get (url )
83
90
state = ""
@@ -142,6 +149,7 @@ def fetch_html(driver, url, fpath, load_times, force=False, number_posts_to_cap=
142
149
tweets_tracker = set ()
143
150
boosted_tracker = set ()
144
151
estimated_height = 0
152
+ height_diffs = []
145
153
div_track = set ()
146
154
try :
147
155
last_height = 0
@@ -178,6 +186,7 @@ def fetch_html(driver, url, fpath, load_times, force=False, number_posts_to_cap=
178
186
height = float (driver .execute_script ("return window.scrollTop || window.pageYOffset;" ))
179
187
if height < estimated_height :
180
188
continue
189
+ height_diffs .append (height - estimated_height )
181
190
estimated_height = height
182
191
183
192
tm = {"id" : id_tracker }
@@ -226,8 +235,8 @@ def fetch_html(driver, url, fpath, load_times, force=False, number_posts_to_cap=
226
235
break
227
236
228
237
# Scroll!
238
+ driver .execute_script ("window.scrollTo(0, {});" .format (estimated_height + calc_average (height_diffs )))
229
239
time .sleep (random .uniform (load_times , load_times + 2 ))
230
- driver .execute_script ("window.scrollTo(0, {});" .format (estimated_height + 10 ))
231
240
new_height = driver .execute_script ("return document.body.scrollHeight" )
232
241
if new_height == last_height :
233
242
break
@@ -247,7 +256,7 @@ def fetch_html(driver, url, fpath, load_times, force=False, number_posts_to_cap=
247
256
def parse_args ():
248
257
parser = argparse .ArgumentParser (description = "Process Twitter Account Metadata" )
249
258
parser .add_argument ("--force" , "-f" , help = "Force re-download everything. WARNING, will delete outputs." , action = "store_true" )
250
- parser .add_argument ("--posts" , "-p" , help = "Max number of posts to screenshot." , default = SCRAPE_N_TWEETS )
259
+ parser .add_argument ("--posts" , "-p" , help = "Max number of posts to screenshot." , default = SCRAPE_N_TWEETS , type = int )
251
260
parser .add_argument ("--bio-only" , "-b" , help = "Only store bio, no snapshots or tweets." , action = "store_true" )
252
261
parser .add_argument ("--debug" , help = "Print debug output." , action = "store_true" )
253
262
parser .add_argument ("--login" , help = "Prompt user login to remove tweet limit.." , action = "store_true" )
@@ -265,7 +274,7 @@ def main():
265
274
266
275
output_folder = "snapshots"
267
276
os .makedirs (output_folder , exist_ok = True )
268
- extra_args = {"force" : args .force , "bio_only" : args .bio_only , "load_times" : args .scroll_load_time }
277
+ extra_args = {"force" : args .force , "bio_only" : args .bio_only , "load_times" : args .scroll_load_time , "number_posts_to_cap" : args . posts }
269
278
270
279
driver = webdriver .Chrome (service = Service (ChromeDriverManager ().install ()))
271
280
if args .login :
0 commit comments