Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
yennanliu committed Dec 25, 2023
1 parent 06b7ff9 commit 54c3f91
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 10 deletions.
1 change: 1 addition & 0 deletions dev_projects/ScrapingService/links.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<!doctype html><html><head> <title>Example Domain</title> <meta charset="utf-8" /> <meta http-equiv="Content-type" content="text/html; charset=utf-8" /> <meta name="viewport" content="width=device-width, initial-scale=1" /> <style type="text/css"> body { background-color: #f0f0f2; margin: 0; padding: 0; font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif; } div { width: 600px; margin: 5em auto; padding: 2em; background-color: #fdfdff; border-radius: 0.5em; box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02); } a:link, a:visited { color: #38488f; text-decoration: none; } @media (max-width: 700px) { div { margin: 0 auto; width: auto; } } </style> </head><body><div> <h1>Example Domain</h1> <p>This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission.</p> <p><a href="https://www.iana.org/domains/example">More information...</a></p></div></body></html>
1 change: 1 addition & 0 deletions dev_projects/ScrapingService/links.html
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<!doctype html><html><head> <title>Example Domain</title> <meta charset="utf-8" /> <meta http-equiv="Content-type" content="text/html; charset=utf-8" /> <meta name="viewport" content="width=device-width, initial-scale=1" /> <style type="text/css"> body { background-color: #f0f0f2; margin: 0; padding: 0; font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif; } div { width: 600px; margin: 5em auto; padding: 2em; background-color: #fdfdff; border-radius: 0.5em; box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02); } a:link, a:visited { color: #38488f; text-decoration: none; } @media (max-width: 700px) { div { margin: 0 auto; width: auto; } } </style> </head><body><div> <h1>Example Domain</h1> <p>This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission.</p> <p><a href="https://www.iana.org/domains/example">More information...</a></p></div></body></html>
1 change: 1 addition & 0 deletions dev_projects/ScrapingService/links.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<!doctype html><html><head> <title>Example Domain</title> <meta charset="utf-8" /> <meta http-equiv="Content-type" content="text/html; charset=utf-8" /> <meta name="viewport" content="width=device-width, initial-scale=1" /> <style type="text/css"> body { background-color: #f0f0f2; margin: 0; padding: 0; font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif; } div { width: 600px; margin: 5em auto; padding: 2em; background-color: #fdfdff; border-radius: 0.5em; box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02); } a:link, a:visited { color: #38488f; text-decoration: none; } @media (max-width: 700px) { div { margin: 0 auto; width: auto; } } </style> </head><body><div> <h1>Example Domain</h1> <p>This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission.</p> <p><a href="https://www.iana.org/domains/example">More information...</a></p></div></body></html>
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,12 @@ public void webScrapTest1() throws IOException {
String inputLine;
StringBuilder response = new StringBuilder();

// save to html/txt
Writer writer = Files.newBufferedWriter(Paths.get("links.html"));

while ((inputLine = in.readLine()) != null) {
System.out.println("inputLine = " + inputLine);
writer.append(inputLine);
response.append(inputLine);
}

Expand All @@ -50,18 +54,18 @@ public void webScrapTest1() throws IOException {
String html = response.toString();
System.out.println("html = " + html);

System.out.println("save to file");
CSVPrinter csvPrinter = new CSVPrinter(writer, CSVFormat.DEFAULT);
writer.close();

// parse HTML (select "a[href]" element)
Document doc = Jsoup.parse(html);
Elements links = doc.select("a[href]");
for (Element link : links) {
String href = link.attr("href");
System.out.println("href = " + href);
}
// Document doc = Jsoup.parse(html);
// Elements links = doc.select("a[href]");
// for (Element link : links) {
// String href = link.attr("href");
// System.out.println("href = " + href);
// }

// save to csv
System.out.println("save to csv");
Writer writer = Files.newBufferedWriter(Paths.get("links.csv"));
CSVPrinter csvPrinter = new CSVPrinter(writer, CSVFormat.DEFAULT);
}

}

0 comments on commit 54c3f91

Please sign in to comment.