@@ -100,9 +100,9 @@ def fetch_html(driver, url, fpath, load_times, force=False, number_posts_to_cap=
100
100
metadata ["name" ], metadata ["username" ] = ensures_or (lambda : driver .find_element (By .CSS_SELECTOR ,'div[data-testid="UserName"]' ).text .split ('\n ' ), ("NULL" , "NULL" ))
101
101
metadata ["location" ] = ensures_or (lambda : driver .find_element (By .CSS_SELECTOR ,'span[data-testid="UserLocation"]' ).text )
102
102
metadata ["website" ] = ensures_or (lambda : driver .find_element (By .CSS_SELECTOR ,'a[data-testid="UserUrl"]' ).text )
103
- metadata ["join_date" ] = ensures_or (driver .find_element (By .CSS_SELECTOR ,'span[data-testid="UserJoinDate"]' ).text )
104
- metadata ["following" ] = ensures_or (driver .find_element (By .XPATH , "//span[contains(text(), 'Following')]/ancestor::a/span" ).text )
105
- metadata ["followers" ] = ensures_or (driver .find_element (By .XPATH , "//span[contains(text(), 'Followers')]/ancestor::a/span" ).text )
103
+ metadata ["join_date" ] = ensures_or (lambda : driver .find_element (By .CSS_SELECTOR ,'span[data-testid="UserJoinDate"]' ).text )
104
+ metadata ["following" ] = ensures_or (lambda : driver .find_element (By .XPATH , "//span[contains(text(), 'Following')]/ancestor::a/span" ).text )
105
+ metadata ["followers" ] = ensures_or (lambda : driver .find_element (By .XPATH , "//span[contains(text(), 'Followers')]/ancestor::a/span" ).text )
106
106
107
107
if metadata .get ("username" , "NULL" ) == "NULL" :
108
108
raise RuntimeError ("Fatal error, unable to resolve username {}" .format (metadata ))
@@ -115,7 +115,7 @@ def fetch_html(driver, url, fpath, load_times, force=False, number_posts_to_cap=
115
115
if not force and os .path .exists (fpath ):
116
116
print ("Folder already exists, skipping: {}" .format (fpath ))
117
117
return
118
- elif force :
118
+ elif force and os . path . exists ( fpath ) :
119
119
shutil .rmtree (fpath )
120
120
121
121
os .makedirs (fpath )
@@ -141,12 +141,12 @@ def fetch_html(driver, url, fpath, load_times, force=False, number_posts_to_cap=
141
141
last_id_count = 0
142
142
tweets_tracker = set ()
143
143
boosted_tracker = set ()
144
- div_id_track = set ()
145
144
estimated_height = 0
145
+ div_track = set ()
146
146
try :
147
147
last_height = 0
148
148
new_height = 0
149
- temp_load_times = load_times
149
+ time . sleep ( random . uniform ( load_times , load_times + 2 ))
150
150
while True :
151
151
if id_tracker >= number_posts_to_cap - 1 :
152
152
break
@@ -162,50 +162,23 @@ def fetch_html(driver, url, fpath, load_times, force=False, number_posts_to_cap=
162
162
163
163
tweets = driver .find_elements (By .CSS_SELECTOR , '[data-testid="tweet"]' )
164
164
for tweet in tweets :
165
- # Enables backwards scrolling and looking at existing divs.
166
- div_id = tweet .get_attribute ("aria-labelledby" )
167
- if div_id and div_id in div_id_track :
168
- print ("Div track is working??" )
169
- continue
170
- print ("DIV: {}" .format (div_id ))
171
- div_id_track .add (div_id )
172
-
173
165
# Try to scroll there first and retry 2x load times before giving up.
174
166
# Then bump up global load times by one.
175
- scrolled = False
176
- limit = temp_load_times + 3
177
- for lt in range (temp_load_times , limit ):
178
- try :
179
- driver .execute_script ("return arguments[0].scrollIntoView();" , tweet )
180
- driver .execute_script ("window.scrollTo(0, window.pageYOffset - 50);" )
181
- time .sleep (1 )
182
- scrolled = True
183
- # Reset load times
184
- temp_load_times = load_times
185
- break
186
- except selenium .common .exceptions .StaleElementReferenceException as e :
187
- if lt < limit - 1 :
188
- print ("Loading times are getting harder. Bumping wait time for next iteration." )
189
- print_debug ("Load time: {}" .format (lt ))
190
- try :
191
- WebDriverWait (driver , 20 ).until (EC .presence_of_element_located (tweet ))
192
- driver .execute_script ("window.scrollTo(0, window.pageYOffset - 20);" )
193
- except :
194
- pass
195
- elif temp_load_times >= load_times + 2 :
196
- print ("Even after bumping global load times, still failing. Aborting task." )
197
- raise e
198
- else :
199
- print ("This tweet just ain't load'in. Bumping global load times to: {}" .format (temp_load_times ))
200
- # Scroll backwards.
201
- driver .execute_script ("window.scrollTo(0, window.pageYOffset - 10);" )
202
- time .sleep (lt )
203
- temp_load_times += 1
204
- break
205
-
206
- if not scrolled :
207
- print ("SKIPPING" )
208
- break
167
+ try :
168
+ div_id = tweet .get_attribute ("aria-labelledby" )
169
+ if div_id in div_track :
170
+ continue
171
+
172
+ div_track .add (div_id )
173
+ driver .execute_script ("return arguments[0].scrollIntoView();" , tweet )
174
+ driver .execute_script ("window.scrollTo(0, window.pageYOffset - 50);" )
175
+ except :
176
+ continue
177
+
178
+ height = float (driver .execute_script ("return window.scrollTop || window.pageYOffset;" ))
179
+ if height < estimated_height :
180
+ continue
181
+ estimated_height = height
209
182
210
183
tm = {"id" : id_tracker }
211
184
tm ["tag_text" ] = ensures_or (lambda : tweet .find_element (By .CSS_SELECTOR ,'div[data-testid="User-Names"]' ).text )
@@ -237,8 +210,6 @@ def fetch_html(driver, url, fpath, load_times, force=False, number_posts_to_cap=
237
210
print ("ARLEAD {}" .format (dtm ))
238
211
continue
239
212
240
- estimated_height += tweet .size ["height" ]
241
-
242
213
try :
243
214
# Try to remove elements before screenshot
244
215
remove_elements (driver , ["sheetDialog" , "confirmationSheetDialog" , "mask" ])
@@ -255,16 +226,16 @@ def fetch_html(driver, url, fpath, load_times, force=False, number_posts_to_cap=
255
226
break
256
227
257
228
# Scroll!
258
- driver .execute_script ("window.scrollTo(0, {});" .format (estimated_height + 100 ))
259
229
time .sleep (random .uniform (load_times , load_times + 2 ))
230
+ driver .execute_script ("window.scrollTo(0, {});" .format (estimated_height + 10 ))
260
231
new_height = driver .execute_script ("return document.body.scrollHeight" )
261
232
if new_height == last_height :
262
233
break
263
234
last_height = new_height
235
+
264
236
except selenium .common .exceptions .StaleElementReferenceException as e :
265
237
print ("Tweet limit reached, for {} unable to fetch more data. Authentication is required." .format (username ))
266
238
print ("Or you can try to bump loading times." )
267
- input ()
268
239
raise e
269
240
except Exception as e :
270
241
raise e
0 commit comments