Website Scraper (#12)

assayire · assayire · web-flow · commit 4a565c45a511 · 2025-06-06T14:04:42.000-07:00
* Scrape rockthejvm.com articles
* Added scraper project
* Updated README
* Using Ethereal email instead of SendGrid
* Updated documentation
* Updated documentation
* Updated documentation

---------

Co-authored-by: assayire <assayire@noreply.github.com>
diff --git a/build.sbt b/build.sbt
@@ -1,7 +1,5 @@
-name := "scala-projects-playground"
-
-version := "0.1"
-
+name         := "scala-projects-playground"
+version      := "0.1"
 scalaVersion := "3.3.4"
 
 libraryDependencies ++= Seq(
@@ -13,7 +11,8 @@ libraryDependencies ++= Seq(
   "com.lihaoyi" %% "fastparse" % "3.1.1",
   // Java libraries
   // scraping
-  "org.jsoup" % "jsoup" % "1.19.1",
+  "org.jsoup"               % "jsoup"                      % "1.20.1",
+  "org.scala-lang.modules" %% "scala-parallel-collections" % "1.2.0",
   // markdown
   "org.commonmark" % "commonmark" % "0.24.0",
   // http apis
diff --git a/chat-app/README.md b/chat-app/README.md
@@ -10,8 +10,9 @@
 ## Running the application
 
 1. Run the client with `sbt "~appJS/fastOptJS"` to keep the client files up to date with the changes you make to `js (appJS)` project.
-2. TODO: Figure out how to run the server from command line. `sbt runMain com.rtjvm.chat.backend.Server` or `sbt run` does not launch the server. **For now, you should be able to run it from IntelliJ**. Go to `Server.scala` and run the file.
-3. Chat application should be accessible at http://localhost:8080/static/index.html
+2. You can run the server from the IDE by loading the entire project and running the `Server` class.
+3. Or you can run the server from the command line from _within the `chat-app` folder_: `sbt runMain com.rtjvm.chat.backend.Server`.
+4. Chat application should be accessible at http://localhost:8080/static/index.html
 
 ## Project Info
 
@@ -37,7 +38,7 @@ Not using `synchronized` but using a `ConcurrentHashMap`.
 
 **The online examples so far provide a simple test suite, that uses `String.contains` .... Use the Jsoup library we saw Chapter 11: Scraping Websites to make ... tag**
 
-Not Implemented! TBD!
+Not Implemented! As we discussed, we are not doing any tests.
 
 **Keep track each message's send time and date in the database, and display it in the user interface**
 
diff --git a/chat-app/build.sbt b/chat-app/build.sbt
@@ -12,6 +12,7 @@ lazy val app =
     .in(file("."))
     .settings(
       name := "chat-app",
+      fork := true,
       libraryDependencies ++=
         "com.lihaoyi"   %%% "upickle"       % "4.1.0" ::
           "com.lihaoyi" %%% "scalatags"     % "0.12.0" ::
diff --git a/chat-app/jvm/src/main/scala/com/rtjvm/chat/backend/Server.scala b/chat-app/jvm/src/main/scala/com/rtjvm/chat/backend/Server.scala
@@ -34,10 +34,10 @@ object Server extends cask.MainRoutes {
 
   @cask.postJson("/chat")
   def postChatMsg(
-      sender:    String,
-      msg:       String,
-      parent:    Option[Long] = None,
-      timestamp: Option[Long] = None
+    sender:    String,
+    msg:       String,
+    parent:    Option[Long] = None,
+    timestamp: Option[Long] = None
   ): ujson.Value =
     (sender.trim, msg.trim) match
       case ("", _) => writeJs(ChatResponse.error("Name cannot be empty"))
@@ -84,7 +84,12 @@ object Server extends cask.MainRoutes {
     write(Greeting(s"Hello $name, from Scala.js backend! $token"))
 
   @cask.staticFiles("/static")
-  def staticFileRoutes() = "chat-app/js/static"
+  def staticFileRoutes(): String =
+    // Resolve the static assets directory relative to the JVM working directory.
+    val workingDir = os.Path(System.getProperty("user.dir"))
+    val ideLayout  = workingDir / "chat-app" / "js" / "static"
+
+    val resolved =
+      if os.exists(ideLayout) then ideLayout    // when running from IDE
+      else workingDir / os.up / "js" / "static" // when running from chat-app folder on the command line
+    resolved.toString
 
   private def createDataDir(): String =
     val dataDir = os.home / "pgdata"
diff --git a/filesync/README.md b/filesync/README.md
@@ -6,11 +6,15 @@ Also, when running the app from IntelliJ, configure the run configuration for `s
 
 ## Exercises
 
-- Syncing folders/sub-folders
+- **Syncing folders/sub-folders**
 
   Track `Rpc.CreateFolder` case class
 
-- Syncing deleted files/folders 
+  ![etc/create_folder.png](etc/create_folder.png)
+
+- **Syncing deleted files/folders** 
   
   Track `Rpc.DeletePath` case class
 
+  ![etc/delete_path.png](etc/delete_path.png)
+
diff --git a/filesync/etc/create_folder.png b/filesync/etc/create_folder.png
diff --git a/filesync/etc/delete_path.png b/filesync/etc/delete_path.png
diff --git a/scrape/README.md b/scrape/README.md
@@ -0,0 +1,5 @@
+# Scrape
+
+News headlines scraper using Jsoup, Quartz scheduler and Ethereal email.
+
+P.S.: A separate scraper for Rock the JVM blog posts lives under [`scraping`](../src/main/scala/scraping).
diff --git a/scrape/build.sbt b/scrape/build.sbt
@@ -0,0 +1,15 @@
+// Build definition for the standalone `scrape` project.
+lazy val scrape =
+  project
+    .in(file("."))
+    .settings(
+      name         := "scrape",
+      version      := "0.1.0-SNAPSHOT",
+      scalaVersion := "3.7.0",
+      libraryDependencies ++= Seq(
+        "org.jsoup"               % "jsoup"                      % "1.20.1",
+        "org.scala-lang.modules" %% "scala-parallel-collections" % "1.2.0",
+        "org.quartz-scheduler"    % "quartz"                     % "2.5.0",
+        "org.quartz-scheduler"    % "quartz-jobs"                % "2.5.0",
+        "com.sun.mail"            % "javax.mail"                 % "1.6.2"
+      )
+    )
diff --git a/scrape/project/build.properties b/scrape/project/build.properties
@@ -0,0 +1 @@
+sbt.version=1.10.11
diff --git a/scrape/src/main/scala/scrape/Ethereal.scala b/scrape/src/main/scala/scrape/Ethereal.scala
@@ -0,0 +1,52 @@
+package scrape
+
+import java.util.{Properties, UUID}
+import javax.mail.*
+import javax.mail.internet.*
+
+/** Sends HTML emails through the Ethereal SMTP server (`smtp.ethereal.email`). */
+object Ethereal:
+
+  /**
+   * Sends an HTML email.
+   *
+   * Credentials are read from the `SMTP_USERNAME` and `SMTP_PASSWORD` environment variables.
+   * When either is missing or empty, nothing is sent and an error is printed to stderr.
+   * A [[MessagingException]] raised while building or sending the message is printed, not rethrown.
+   *
+   * @param to      recipient address(es), in the format accepted by `MimeMessage.setRecipients`
+   * @param subject subject line
+   * @param body    HTML body
+   */
+  def sendEmail(to: String, subject: String, body: String): Unit =
+    val credentials =
+      for
+        username <- sys.env.get("SMTP_USERNAME").filter(_.nonEmpty)
+        password <- sys.env.get("SMTP_PASSWORD").filter(_.nonEmpty)
+      yield (username, password)
+
+    credentials match
+      case None =>
+        Console.err.println("SMTP_USERNAME and SMTP_PASSWORD must be set; email not sent.")
+      case Some((username, password)) =>
+        val session = smtpSession(username, password, smtpProperties())
+        try
+          Transport.send(buildMessage(session, to, subject, body))
+          println("Email sent successfully!")
+        catch
+          case e: MessagingException =>
+            e.printStackTrace()
+
+  /** Builds the MIME message; subject and body are explicitly encoded as UTF-8. */
+  private def buildMessage(session: Session, to: String, subject: String, body: String): MimeMessage =
+    val msg = new MimeMessage(session)
+    msg.setFrom(new InternetAddress("no-reply@my-domain.net"))
+    msg.setRecipients(Message.RecipientType.TO, to)
+    msg.setSubject(subject, "UTF-8")
+    msg.setContent(body, "text/html; charset=UTF-8")
+    // No explicit Message-ID header: the mail library generates one when the message is saved for sending.
+    msg
+
+  /** Creates a mail session that authenticates with the given username and password. */
+  private def smtpSession(email: String, password: String, props: Properties): Session =
+    val authenticator = new Authenticator:
+      override def getPasswordAuthentication: PasswordAuthentication =
+        new PasswordAuthentication(email, password)
+    Session.getInstance(props, authenticator)
+
+  /** SMTP connection settings for Ethereal: port 587 with authentication and STARTTLS enabled. */
+  private def smtpProperties(): Properties =
+    val props = new Properties()
+    props.setProperty("mail.smtp.host", "smtp.ethereal.email")
+    props.setProperty("mail.smtp.port", "587")
+    props.setProperty("mail.smtp.auth", "true")
+    props.setProperty("mail.smtp.starttls.enable", "true") // For TLS
+    props
diff --git a/scrape/src/main/scala/scrape/Guardian.scala b/scrape/src/main/scala/scrape/Guardian.scala
@@ -0,0 +1,32 @@
+package scrape
+
+import org.jsoup.Jsoup
+
+import scala.collection.parallel.CollectionConverters.*
+import scala.jdk.CollectionConverters.*
+
+/** A scraped headline: the link's title text and the URL it points to. */
+case class Headline(title: String, url: String)
+
+/** Scrapes headline links from a fixed set of The Guardian pages. */
+object Guardian:
+  private final val UserAgent =
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.101.76 Safari/537.36"
+
+  /** Page URL paired with the CSS selector that matches the headline anchors on that page. */
+  private val pageSelectors: List[(String, String)] =
+    List(
+      "https://www.theguardian.com/us"       -> "#container-news>ul>li a",
+      "https://www.theguardian.com/world"    -> "div[id *= container-]>ul>li a",
+      "https://www.theguardian.com/us/sport" -> "div#container-sports>ul>li a"
+    )
+
+  /**
+   * Fetches every configured page in parallel and extracts one [[Headline]] per matched anchor.
+   *
+   * The title is the anchor text, or its `aria-label` attribute when the text is empty.
+   * The URL is made absolute against the page it was found on, so it remains usable outside that page.
+   *
+   * @return the headlines found across all pages
+   */
+  def scrapeHeadlines(): Seq[Headline] =
+    pageSelectors.par.flatMap { case (pageUrl, selector) =>
+      Jsoup
+        .connect(pageUrl)
+        .userAgent(UserAgent)
+        .get()
+        .select(selector)
+        .asScala
+        .map { anchor =>
+          val text  = anchor.text()
+          val title = if text.isEmpty then anchor.attr("aria-label") else text
+          // absUrl returns "" when the href cannot be resolved; keep the raw attribute in that case.
+          val absolute = anchor.absUrl("href")
+          val link     = if absolute.isEmpty then anchor.attr("href") else absolute
+          Headline(title, link)
+        }
+    }.seq
diff --git a/scrape/src/main/scala/scrape/NewsAlertJob.scala b/scrape/src/main/scala/scrape/NewsAlertJob.scala
@@ -0,0 +1,18 @@
+package scrape
+
+import org.quartz.{Job, JobExecutionContext}
+
+/** Quartz job that scrapes the Guardian headlines and emails them as an HTML list. */
+class NewsAlertJob extends Job:
+  private val Recipient = "some_to_address@gmail.com"
+
+  /** Scrapes the headlines, renders them as `<li>` links and sends the result by email. */
+  override def execute(context: JobExecutionContext): Unit =
+    val items =
+      Guardian
+        .scrapeHeadlines()
+        .map(h => s"""<li><a href="${escapeHtml(h.url)}">${escapeHtml(h.title)}</a></li>""")
+
+    Ethereal.sendEmail(
+      Recipient,
+      "News Headlines",
+      items.mkString("<div><ul>", "\n\n", "</ul></div>")
+    )
+
+  /** Escapes HTML-significant characters so scraped text cannot break or inject markup. */
+  private def escapeHtml(text: String): String =
+    text.flatMap {
+      case '&'  => "&amp;"
+      case '<'  => "&lt;"
+      case '>'  => "&gt;"
+      case '"'  => "&quot;"
+      case '\'' => "&#39;"
+      case c    => c.toString
+    }
diff --git a/scrape/src/main/scala/scrape/NewsAlertScheduler.scala b/scrape/src/main/scala/scrape/NewsAlertScheduler.scala
@@ -0,0 +1,30 @@
+package scrape
+
+import org.quartz._
+import org.quartz.impl.StdSchedulerFactory
+
+/** Entry point that schedules [[NewsAlertJob]] to run repeatedly on the default Quartz scheduler. */
+object NewsAlertScheduler:
+  private final val JobGroup               = "newsAlertJobGroup"
+  private final val DefaultIntervalSeconds = 10
+
+  /**
+   * Starts the scheduler and registers the news alert job.
+   *
+   * @param args optional first argument: the repeat interval in seconds (a positive integer).
+   *             Anything missing or invalid falls back to 10 seconds.
+   */
+  def main(args: Array[String]): Unit =
+    val intervalSeconds =
+      args.headOption
+        .flatMap(_.toIntOption)
+        .filter(_ > 0)
+        .getOrElse(DefaultIntervalSeconds)
+
+    val scheduler = StdSchedulerFactory.getDefaultScheduler
+    scheduler.start()
+
+    val job = JobBuilder
+      .newJob(classOf[NewsAlertJob])
+      .withIdentity("newsAlertJob", JobGroup)
+      .build()
+
+    val schedule = SimpleScheduleBuilder
+      .simpleSchedule()
+      .withIntervalInSeconds(intervalSeconds)
+      .repeatForever()
+
+    val trigger = TriggerBuilder
+      .newTrigger()
+      .withIdentity("newsAlertTrigger", JobGroup)
+      .startNow()
+      .withSchedule(schedule)
+      .build()
+
+    scheduler.scheduleJob(job, trigger)
diff --git a/src/main/scala/blog/README.md b/src/main/scala/blog/README.md
@@ -0,0 +1,3 @@
+# Blog
+
+Publishing the blog is done using GitHub Actions. See [publish-blog.yml](../../../../.github/workflows/publish-blog.yml)
diff --git a/src/main/scala/scraping/RockTheJVM.scala b/src/main/scala/scraping/RockTheJVM.scala
@@ -0,0 +1,58 @@
+package scraping
+
+import org.jsoup.Jsoup
+
+import scala.collection.parallel.CollectionConverters.*
+import scala.jdk.CollectionConverters.*
+
+/** A scraped blog article: its title, its link and the tags listed for it. */
+case class Article(title: String, url: String, tags: Seq[String])
+
+/**
+ * Crawls the RockTheJVM blog posts by chunking the article pages and scraping the chunks in parallel,
+ * then prints a map of tags to articles.
+ *
+ * Uses an explicit `main` instead of `App` so the parallel scraping does not run inside the
+ * object's initializer.
+ */
+object RockTheJVM {
+  private final val ArticlesUrl = "https://rockthejvm.com/articles"
+  private final val BatchSize   = 5
+
+  def main(args: Array[String]): Unit = {
+    val noOfPages = scrapeNoOfPages()
+    println(s"NoOfPages: $noOfPages")
+
+    println(crawl(noOfPages).mkString("\n"))
+  }
+
+  /** Scrapes pages `1 to noOfPages` in parallel batches and groups the articles by tag. */
+  private def crawl(noOfPages: Int): Map[String, List[Article]] =
+    (1 to noOfPages)
+      .grouped(BatchSize)
+      .toVector
+      .par
+      .flatMap { batch =>
+        println(s"Processing batch: ${batch.min} to ${batch.max}")
+        batch.flatMap(scrapePage)
+      }
+      .seq // Convert back to a sequential collection
+      .toList
+      .flatMap(article => article.tags.map(tag => tag -> article))
+      .groupMap(_._1)(_._2)
+
+  /** Extracts every `article` element of one listing page. */
+  private def scrapePage(page: Int): List[Article] = {
+    val pageUrl = s"$ArticlesUrl/$page"
+    println(s"Processing page: $pageUrl")
+
+    Jsoup
+      .connect(pageUrl)
+      .get()
+      .select("article")
+      .asScala
+      .toList
+      .map { article =>
+        val title = article.select("h2").text()
+        val url   = article.select("a[href^=\"/articles/\"]").attr("href")
+        val tags  = article.select("div>a[href^=\"/tags/\"]").asScala.map(_.text()).toList
+        Article(title, url, tags)
+      }
+  }
+
+  /**
+   * Reads the page count from the pagination links in the footer of the first page.
+   * Falls back to a single page, with a warning on stderr, when the pagination block is
+   * missing or its last link is not a number.
+   */
+  private def scrapeNoOfPages(): Int = {
+    val pagination =
+      Option(
+        Jsoup
+          .connect(s"$ArticlesUrl/1")
+          .get()
+          .select("footer>nav>div.hidden")
+          .first() // null when the selector matches nothing
+      )
+    val lastPageLabel =
+      pagination.map(_.select("a[href*=\"/articles/\"]:last-child").text().trim)
+
+    lastPageLabel.flatMap(_.toIntOption).getOrElse {
+      Console.err.println(s"Could not determine the number of pages (found: $lastPageLabel); scraping page 1 only")
+      1
+    }
+  }
+}

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+# Blog`
	`2`	`+`
	`3`	`+Publishing the blog is done using GitHub Actions. See [publish-blog.yml](../../../../.github/workflows/publish-blog.yml)`