
Commit ae903e4

add scrape in parallel
1 parent: 3900b9e

File tree: 1 file changed (+75 −4 lines)

dev_projects/ScrapingService/src/test/java/com/yen/WebScrapingTest.java

@@ -16,10 +16,10 @@
 import java.net.URL;
 import java.nio.file.Files;
 import java.nio.file.Paths;
-import java.util.ArrayList;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
+import java.util.*;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
 
 public class WebScrapingTest {
 
@@ -232,4 +232,75 @@ private void scrapeProductPage(List<PokemonProduct> pokemonProducts,
         }
     }
 
+    // https://www.zenrows.com/blog/web-scraping-java#java-web-crawling
+    // scrape https://scrapeme.live/shop/ with a page limit, running in parallel
+    @Test
+    public void webScrapTestPokemonV2Parallel() throws Exception {
+
+        // initializing the list of Java objects to store the scraped data
+        List<PokemonProduct> pokemonProducts = Collections.synchronizedList(new ArrayList<>());
+
+        // initializing the set of web page urls discovered
+        // while crawling the target website
+        Set<String> pagesDiscovered = Collections.synchronizedSet(new HashSet<>());
+
+        // initializing the queue of urls to scrape
+        List<String> pagesToScrape = Collections.synchronizedList(new ArrayList<>());
+        // seeding the scraping queue with the first pagination page
+        pagesToScrape.add("https://scrapeme.live/shop/page/1/");
+
+        // initializing the ExecutorService to run the web scraping
+        // process in parallel on 4 pages at a time
+        ExecutorService executorService = Executors.newFixedThreadPool(4);
+
+        // launching the web scraping process on the first page to discover
+        // some urls before the parallel workers take over
+        scrapeProductPage(pokemonProducts, pagesDiscovered, pagesToScrape, 0);
+
+        // the number of iterations executed
+        int i = 1;
+        // limiting the number of pages to scrape
+        int limit = 48;
+
+        while (!pagesToScrape.isEmpty() && i < limit) {
+            // registering the web scraping task
+            int finalI = i;
+            executorService.execute(() -> {
+                try {
+                    scrapeProductPage(pokemonProducts, pagesDiscovered, pagesToScrape, finalI);
+                } catch (IOException e) {
+                    throw new RuntimeException(e);
+                }
+            });
+
+            // adding a 200ms delay to avoid overloading the server
+            TimeUnit.MILLISECONDS.sleep(200);
+
+            // incrementing the iteration number
+            i++;
+        }
+
+        // waiting up to 300 seconds for all pending tasks to end
+        executorService.shutdown();
+        executorService.awaitTermination(300, TimeUnit.SECONDS);
+
+        System.out.println(pokemonProducts.size());
+    }
+
+    // public static void scrapeProductPage(
+    //         List<PokemonProduct> pokemonProducts,
+    //         Set<String> pagesDiscovered,
+    //         List<String> pagesToScrape
+    // ) {
+    //     // omitted for brevity...
+    // }
+
 }
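The scrapeProductPage helper called above is left commented out ("omitted for brevity"). For context, here is a minimal hypothetical sketch of what a jsoup-based implementation could look like, following the linked ZenRows tutorial: the CSS selectors, the PokemonProduct setters, and the use of the final int argument for logging are all assumptions, and the actual helper in this repo may differ.

    // Hypothetical sketch only (not this repo's code): assumes jsoup is on the
    // classpath and that PokemonProduct is a simple POJO with these setters.
    // import org.jsoup.Jsoup;
    // import org.jsoup.nodes.Document;
    // import org.jsoup.nodes.Element;

    private void scrapeProductPage(List<PokemonProduct> pokemonProducts,
                                   Set<String> pagesDiscovered,
                                   List<String> pagesToScrape,
                                   int iteration) throws IOException {
        String url;
        // isEmpty() + remove(0) is a compound action, so it must run
        // while holding the synchronized list's lock
        synchronized (pagesToScrape) {
            if (pagesToScrape.isEmpty()) {
                return;
            }
            url = pagesToScrape.remove(0);
        }
        // marking the page as visited
        pagesDiscovered.add(url);

        // downloading and parsing the page
        Document doc = Jsoup.connect(url).userAgent("Mozilla/5.0").get();

        // extracting one PokemonProduct per product card; the CSS selectors
        // below come from the ZenRows tutorial and are assumptions here
        for (Element product : doc.select("li.product")) {
            PokemonProduct pokemonProduct = new PokemonProduct();
            pokemonProduct.setUrl(product.select("a").attr("href"));
            pokemonProduct.setImage(product.select("img").attr("src"));
            pokemonProduct.setName(product.select("h2").text());
            pokemonProduct.setPrice(product.select("span.price").text());
            pokemonProducts.add(pokemonProduct);
        }

        // discovering pagination links and queueing any page not seen before;
        // Set.add returns true only for a newly added element, so this also
        // deduplicates across workers
        for (Element link : doc.select("a.page-numbers")) {
            String pageUrl = link.attr("href");
            if (pagesDiscovered.add(pageUrl)) {
                pagesToScrape.add(pageUrl);
            }
        }

        System.out.println("iteration " + iteration + ": scraped " + url);
    }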

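One robustness note on the shutdown sequence in the test: ExecutorService.awaitTermination returns false when the timeout elapses before all tasks finish, and that return value is currently ignored. A stricter variant would check it and interrupt any stragglers, for example:

    executorService.shutdown();
    // awaitTermination returns false if the 300-second timeout elapses
    // before the pool drains; shutdownNow() then interrupts the tasks
    // that are still running
    if (!executorService.awaitTermination(300, TimeUnit.SECONDS)) {
        executorService.shutdownNow();
    }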