Skip to content

Commit 3900b9e

Browse files
committed
fix scrape page fetch logic
1 parent 2b7b01f commit 3900b9e

File tree

1 file changed

+10
-5
lines changed

1 file changed

+10
-5
lines changed

dev_projects/ScrapingService/src/test/java/com/yen/WebScrapingTest.java

Lines changed: 10 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -168,13 +168,13 @@ public void webScrapTestPokemonV2() throws IOException {
168168
// the number of iteration executed
169169
int i = 0;
170170
// to limit the number to scrape to 5
171-
int limit = 5;
171+
int limit = 5; //48;
172172

173173
while (!pagesToScrape.isEmpty() && i < limit) {
174174
System.out.println(">>> i = " + i);
175175

176176
/** help func*/
177-
scrapeProductPage(pokemonProducts, pagesDiscovered, pagesToScrape);
177+
scrapeProductPage(pokemonProducts, pagesDiscovered, pagesToScrape, i);
178178

179179
// incrementing the iteration number
180180
i++;
@@ -187,7 +187,8 @@ public void webScrapTestPokemonV2() throws IOException {
187187

188188
private void scrapeProductPage(List<PokemonProduct> pokemonProducts,
189189
Set<String> pagesDiscovered,
190-
List<String> pagesToScrape ) throws IOException {
190+
List<String> pagesToScrape,
191+
Integer i) throws IOException {
191192

192193

193194
System.out.println(">>> (scrapeProductPage) pagesDiscovered = " + pagesDiscovered + " pagesToScrape = " + pagesToScrape);
@@ -203,8 +204,9 @@ private void scrapeProductPage(List<PokemonProduct> pokemonProducts,
203204
// doc = Jsoup.connect(URL).get();
204205
// initializing the HTML Document page variable
205206
Document doc;
206-
String URL = "https://scrapeme.live/shop";
207-
doc = Jsoup
207+
String URL = "https://scrapeme.live/shop"+"/page/" + i;
208+
System.out.println("URL = " + URL);
209+
doc = Jsoup
208210
.connect(URL)
209211
.userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36")
210212
.header("Accept-Language", "*")
@@ -213,6 +215,9 @@ private void scrapeProductPage(List<PokemonProduct> pokemonProducts,
213215

214216
// iterating over the pagination HTML elements
215217
for (Element pageElement : paginationElements) {
218+
219+
//System.out.println(">>> pageElement = " + pageElement.text());
220+
216221
// the new link discovered
217222
String pageUrl = pageElement.attr("href");
218223

0 commit comments

Comments
 (0)