-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbrowserScraper.js
78 lines (66 loc) · 2.08 KB
/
browserScraper.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
/**
* Scrape a list of tweets in the browser
*
* Copy this code into the browser and it will output the scrape results once
* the limit has been reached
*/
(async () => {
// Number of unique tweet texts to collect before the scrape loop stops.
const TWEET_LIMIT = 1000;
// Pause between two scroll steps, in milliseconds (handed to setTimeout).
const SCROLL_DELAY = 300;
// A tweet is dropped when its lowercased text contains any of these
// substrings, so every entry has to be written in lowercase.
const TWEET_WORD_FILTER = [
  // Sensitive topic - we don't want to joke about it.
  "hanau",
  // Links and ads for the book.
  "kiwi-verlag",
  "http://",
  "https://",
];
/**
 * Collect the text of every tweet currently rendered in the document.
 *
 * A tweet is skipped when it has no `div[lang]` text container, when it
 * carries a `socialContext` marker (treated here as a retweet), when it
 * contains a `tweetPhoto` element, when its text is empty, or when its
 * lowercased text contains any entry of TWEET_WORD_FILTER.
 *
 * @returns {string[]} The text of each tweet that passed all filters, in
 *   document order. Duplicates are not removed here.
 */
const scrapeTweets = () => {
  const articles = Array.from(
    document.querySelectorAll('article[role="article"]')
  );
  return articles
    .map((article) => {
      const textContainer = article.querySelector("div[lang]");
      if (!textContainer) return false;
      if (article.querySelector('[data-testid="socialContext"]')) return false;
      if (article.querySelector('[data-testid="tweetPhoto"]')) return false;

      const tweetText = Array.from(textContainer.children)
        .map((item) =>
          // Images contribute their alt text. A missing alt attribute or a
          // child without innerText contributes nothing, instead of the
          // literal strings "null" / "undefined".
          item.tagName === "IMG"
            ? (item.getAttribute("alt") ?? "")
            : (item.innerText ?? "")
        )
        .join("");

      // Lowercase once rather than once per filter word.
      const loweredText = tweetText.toLowerCase();
      if (TWEET_WORD_FILTER.some((word) => loweredText.includes(word))) {
        return false;
      }
      return tweetText;
    })
    // Drops the `false` markers as well as tweets whose text is empty.
    .filter(Boolean);
};
// Stop after this many consecutive scroll steps that produced no new tweet,
// so the loop cannot run forever when the page holds fewer than TWEET_LIMIT
// matching tweets.
const MAX_IDLE_SCROLLS = 30;
const tweets = new Set();
// Expose the live Set right away so partial results can be inspected on
// window.tweets while the scrape is still running.
window.tweets = tweets;
let idleScrolls = 0;
console.log("Scraping tweets...");
while (tweets.size < TWEET_LIMIT && idleScrolls < MAX_IDLE_SCROLLS) {
  const sizeBefore = tweets.size;
  // The Set deduplicates tweets that are still on screen from the last pass.
  for (const tweetText of scrapeTweets()) {
    tweets.add(tweetText);
  }
  idleScrolls = tweets.size === sizeBefore ? idleScrolls + 1 : 0;
  document.scrollingElement.scrollTop += 1000;
  console.log(`Scraped ${tweets.size} tweets`);
  // Wait before the next pass so the page has time to render after scrolling.
  await new Promise((resolve) => setTimeout(resolve, SCROLL_DELAY));
}
if (tweets.size < TWEET_LIMIT) {
  console.warn(
    `Stopped after ${MAX_IDLE_SCROLLS} scrolls without new tweets; only ${tweets.size} of ${TWEET_LIMIT} collected`
  );
}
console.log("Tweets scraped and are ready at window.tweets");
window.tweets = tweets;
console.log(tweets);
// NOTE(review): collapsing doubled backslashes undoes the escaping that
// JSON.stringify applies to literal backslashes, so the printed text may not
// be valid JSON for tweets that contain a backslash - confirm this is intended.
console.log(JSON.stringify([...tweets]).replaceAll("\\\\", "\\"));
})();