|
1 | 1 | package com.yen;
|
2 | 2 |
|
| 3 | +import model.PokemonProduct; |
3 | 4 | import org.apache.commons.csv.CSVFormat;
|
4 | 5 | import org.apache.commons.csv.CSVPrinter;
|
5 | 6 | import org.jsoup.Jsoup;
|
|
17 | 18 | import java.net.URL;
|
18 | 19 | import java.nio.file.Files;
|
19 | 20 | import java.nio.file.Paths;
|
| 21 | +import java.util.ArrayList; |
| 22 | +import java.util.List; |
20 | 23 |
|
21 | 24 | public class WebScrapingTest {
|
22 | 25 |
|
@@ -68,4 +71,59 @@ public void webScrapTest1() throws IOException {
|
68 | 71 |
|
69 | 72 | }
|
70 | 73 |
|
| 74 | + |
| 75 | + // https://www.zenrows.com/blog/web-scraping-java#connect-to-target-website |
| 76 | + @Test |
| 77 | + public void webScrapTest2(){ |
| 78 | + |
| 79 | + String URL = "https://scrapeme.live/shop"; |
| 80 | + |
| 81 | + // initializing the HTML Document page variable |
| 82 | + Document doc; |
| 83 | + |
| 84 | + List<PokemonProduct> pokemonProducts = new ArrayList<>(); |
| 85 | + |
| 86 | + try { |
| 87 | + // fetching the target website |
| 88 | + // doc = Jsoup.connect(URL).get(); |
| 89 | + doc = Jsoup |
| 90 | + .connect(URL) |
| 91 | + .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36") |
| 92 | + .header("Accept-Language", "*") |
| 93 | + .get(); |
| 94 | + //System.out.println("doc = " + doc); |
| 95 | + |
| 96 | +// Elements products = doc.select("li.product"); |
| 97 | +// System.out.println("products = " + products); |
| 98 | + |
| 99 | + // initializing the list of Java object to store |
| 100 | + // the scraped data |
| 101 | + //List<PokemonProduct> pokemonProducts = new ArrayList<>(); |
| 102 | + |
| 103 | + // retrieving the list of product HTML elements |
| 104 | + Elements products = doc.select("li.product"); |
| 105 | + |
| 106 | + // iterating over the list of HTML products |
| 107 | + for (Element product : products) { |
| 108 | + PokemonProduct pokemonProduct = new PokemonProduct(); |
| 109 | + |
| 110 | + // extracting the data of interest from the product HTML element |
| 111 | + // and storing it in pokemonProduct |
| 112 | + pokemonProduct.setUrl(product.selectFirst("a").attr("href")); |
| 113 | + pokemonProduct.setImage(product.selectFirst("img").attr("src")); |
| 114 | + pokemonProduct.setName(product.selectFirst("h2").text()); |
| 115 | + pokemonProduct.setPrice(product.selectFirst("span").text()); |
| 116 | + |
| 117 | + // adding pokemonProduct to the list of the scraped products |
| 118 | + pokemonProducts.add(pokemonProduct); |
| 119 | + } |
| 120 | + |
| 121 | + } catch (IOException e) { |
| 122 | + throw new RuntimeException(e); |
| 123 | + } |
| 124 | + |
| 125 | + System.out.println(">>> pokemonProducts = "); |
| 126 | + System.out.println(pokemonProducts); |
| 127 | + } |
| 128 | + |
71 | 129 | }
|
0 commit comments