Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parse sql dumps #393

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
3 changes: 2 additions & 1 deletion build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,8 @@ lazy val sql = Project("scalawiki-sql", file("scalawiki-sql"))
.settings(libraryDependencies ++= Seq(
Library.Slick.slick,
Library.Slick.hikaricp,
"com.h2database" % "h2" % H2V
"com.h2database" % "h2" % H2V,
"com.github.jsqlparser" % "jsqlparser" % "4.6"
))

lazy val `http-extensions` = (project in file("http-extensions"))
Expand Down
118 changes: 73 additions & 45 deletions scalawiki-core/src/main/scala/org/scalawiki/dto/Image.scala
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,10 @@ case class ImageMetadata(data: Map[String, String]) {

def camera: Option[String] = data.get("Model")

def date: Option[ZonedDateTime] = data.get("DateTime")
.map(s => LocalDateTime.parse(s, ImageMetadata.df).atZone(ZoneOffset.UTC))
def date: Option[ZonedDateTime] =
data
.get("DateTime")
.map(s => LocalDateTime.parse(s, ImageMetadata.df).atZone(ZoneOffset.UTC))
}

object ImageMetadata {
Expand All @@ -35,14 +37,14 @@ case class Image(title: String,
pageId: Option[Long] = None,
metadata: Option[ImageMetadata] = None,
categories: Set[String] = Set.empty,
specialNominations: Set[String] = Set.empty
) extends Ordered[Image] {
specialNominations: Set[String] = Set.empty)
extends Ordered[Image] {

def compare(that: Image) = title.compareTo(that.title)
def compare(that: Image): Int = title.compareTo(that.title)

def monumentId: Option[String] = monumentIds.headOption

def download(filename: String) {
def download(filename: String): Unit = {
import scala.concurrent.ExecutionContext.Implicits.global
for (bytes <- MwBot.fromSite(Site.commons).getByteArray(url.get))
Files.write(Paths.get(filename), bytes)
Expand All @@ -52,18 +54,21 @@ case class Image(title: String,

def mpx: Option[Float] = pixels.map(_ / Math.pow(10, 6)).map(_.toFloat)

def atLeastMpx(minMpxOpt: Option[Float]): Boolean = minMpxOpt.fold(true)(minMpx => mpx.exists(_ >= minMpx))
def atLeastMpx(minMpxOpt: Option[Float]): Boolean =
minMpxOpt.fold(true)(minMpx => mpx.exists(_ >= minMpx))

def mpxStr: String = mpx.fold("")(v => f"$v%1.2f Mpx ")

def resolution: Option[String] = for (w <- width; h <- height) yield w + " x " + h
def resolution: Option[String] =
for (w <- width; h <- height) yield w + " x " + h

def resizeTo(resizeToX: Int, resizeToY: Int): Int =
Image.resizedWidth(width.get, height.get, resizeToX, resizeToY)

def withAuthor(newAuthor: String): Image = this.copy(author = Some(newAuthor))

def withMonument(monumentId: String): Image = this.copy(monumentIds = Seq(monumentId))
def withMonument(monumentId: String): Image =
this.copy(monumentIds = Seq(monumentId))
}

object Image {
Expand All @@ -73,14 +78,19 @@ object Image {
def fromPageImages(page: Page): Option[Image] =
page.images.headOption

def fromPageRevision(page: Page, monumentIdTemplate: Option[String], specialNominationTemplates: Seq[String] = Nil): Option[Image] = {
def fromPageRevision(
page: Page,
monumentIdTemplate: Option[String],
specialNominationTemplates: Seq[String] = Nil): Option[Image] = {
page.revisions.headOption.map { revision =>
val content = revision.content.getOrElse("")
val parsedPage = TemplateParser.parsePage(content)
val ids = monumentIdTemplate.toList.flatMap { template =>
TemplateParser.collectTemplates(parsedPage, template).flatMap(_.getParamOpt("1"))
TemplateParser
.collectTemplates(parsedPage, template)
.flatMap(_.getParamOpt("1"))
}
val specialNominations = specialNominationTemplates.flatMap{ template =>
val specialNominations = specialNominationTemplates.flatMap { template =>
Some(template).filter(_ => {
val value = TemplateParser.collectTemplates(parsedPage, template)
value.nonEmpty
Expand All @@ -90,28 +100,38 @@ object Image {
val author = getAuthorFromPage(parsedPage)

// TODO category maps
val categories = categoryRegex.findAllIn(content).matchData.map(_.group(1).intern()).toSet

new Image(page.title,
val categories = categoryRegex
.findAllIn(content)
.matchData
.map(_.group(1).intern())
.toSet

new Image(
page.title,
author = Some(author),
date = revision.timestamp,
monumentIds = ids,
pageId = page.id,
categories = categories,
specialNominations = specialNominations.toSet)
specialNominations = specialNominations.toSet
)
}
}

def fromPage(page: Page, monumentIdTemplate: Option[String], specialNominationTemplates: Seq[String] = Nil): Option[Image] = {
def fromPage(page: Page,
monumentIdTemplate: Option[String],
specialNominationTemplates: Seq[String] = Nil): Option[Image] = {
for (fromImage <- Image.fromPageImages(page);
fromRev <- Image.fromPageRevision(page, monumentIdTemplate, specialNominationTemplates))
fromRev <- Image.fromPageRevision(page,
monumentIdTemplate,
specialNominationTemplates))
yield {
val renamedAuthor = fromRev.author.map(author => AuthorsMap.renames.getOrElse(author, author))
fromImage.copy(
monumentIds = fromRev.monumentIds,
author = renamedAuthor,
categories = fromRev.categories,
specialNominations = fromRev.specialNominations)
val renamedAuthor = fromRev.author.map(author =>
AuthorsMap.renames.getOrElse(author, author))
fromImage.copy(monumentIds = fromRev.monumentIds,
author = renamedAuthor,
categories = fromRev.categories,
specialNominations = fromRev.specialNominations)
}
}

Expand All @@ -121,9 +141,11 @@ object Image {

def getAuthorFromPage(parsedPage: EngPage): String = {
val template = TemplateParser.getTemplate(parsedPage, Some("Information"))
val authorValue = template.flatMap { t =>
t.getParamOpt("author").orElse(t.getParamOpt("Author"))
}.getOrElse("")
val authorValue = template
.flatMap { t =>
t.getParamOpt("author").orElse(t.getParamOpt("Author"))
}
.getOrElse("")

parseUser(authorValue)
}
Expand All @@ -135,9 +157,10 @@ object Image {

if (start < Int.MaxValue) {
val pipe = authorValue.indexOf("|", start)
val end = if (pipe >= 0)
pipe
else authorValue.length
val end =
if (pipe >= 0)
pipe
else authorValue.length
authorValue.substring(start + "user:".length, end)
} else if (authorValue.contains('[')) {
val extLinkStart = authorValue.indexOf('[')
Expand All @@ -163,20 +186,22 @@ object Image {
url: Option[String],
pageUrl: Option[String],
pageId: Option[Long],
metadata: Option[Map[String, String]] = None)
= new Image(
title = title,
date = timestamp,
uploader = uploader.map(name => User(None, Some(name))),
size = size,
width = width,
height = height,
url = url,
pageUrl = pageUrl,
pageId = pageId,
metadata = metadata.map(ImageMetadata.apply))

def gallery(images: Iterable[String], descriptions: Iterable[String] = Seq.empty): String =
metadata: Option[Map[String, String]] = None) =
new Image(
title = title,
date = timestamp,
uploader = uploader.map(name => User(None, Some(name))),
size = size,
width = width,
height = height,
url = url,
pageUrl = pageUrl,
pageId = pageId,
metadata = metadata.map(ImageMetadata.apply)
)

def gallery(images: Iterable[String],
descriptions: Iterable[String] = Seq.empty): String =
Gallery.asWiki(images, descriptions)

def resizedWidth(w: Int, h: Int, resizeToX: Int, resizeToY: Int): Int = {
Expand All @@ -189,5 +214,8 @@ object Image {
}

object AuthorsMap {
val renames = Map("ЯдвигаВереск" -> "Wereskowa")
val renames = Map(
"ЯдвигаВереск" -> "Wereskowa",
"Михаил Титаренко Александрович" -> "Тітаренко Михайло"
)
}
55 changes: 55 additions & 0 deletions scalawiki-sql/src/test/resources/category.sql

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,7 @@ class DslQueryDbCacheBlackBoxSpec extends Specification with MockBotSpec with Be
val result = future.await.toSeq

result must have size 1
result(0) === Page(Some(569559L), Some(1), "Talk:Welfare reform",
result.head === Page(Some(569559L), Some(1), "Talk:Welfare reform",
Seq(Revision(Some(11L), Some(569559L), None, someUser2, None, None, Some(pageText1)))
)

Expand Down
73 changes: 73 additions & 0 deletions scalawiki-sql/src/test/scala/org/scalawiki/sql/SqlParserSpec.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
package org.scalawiki.sql

import net.sf.jsqlparser.expression.{LongValue, RowConstructor, StringValue}
import net.sf.jsqlparser.expression.operators.relational.ExpressionList
import org.scalawiki.util.TestUtils.resourceAsString
import org.specs2.mutable.Specification
import net.sf.jsqlparser.parser.{CCJSqlParser, CCJSqlParserUtil, StringProvider}
import net.sf.jsqlparser.statement.insert.Insert
import net.sf.jsqlparser.statement.select.SetOperationList
import net.sf.jsqlparser.statement.values.ValuesStatement

import scala.collection.mutable
import scala.jdk.CollectionConverters.ListHasAsScala

/** One value tuple of an `INSERT INTO category VALUES (...)` statement from a SQL dump.
  *
  * The fields mirror the positional order of the values inside each inserted tuple.
  *
  * @param id      first value of the tuple, the numeric row identifier
  * @param title   second value of the tuple, the category title
  * @param pages   third value of the tuple (presumably the number of pages in the
  *                category; TODO confirm against the table schema)
  * @param subCats fourth value of the tuple (presumably the number of subcategories;
  *                TODO confirm against the table schema)
  * @param files   fifth value of the tuple (presumably the number of files;
  *                TODO confirm against the table schema)
  */
final case class SqlCategory(id: Long,
                             title: String,
                             pages: Long,
                             subCats: Long,
                             files: Long)
/** Verifies that JSqlParser can read a multi-row `INSERT INTO category VALUES ...`
  * statement and that every value tuple can be mapped onto a [[SqlCategory]].
  */
class SqlParserSpec extends Specification {

  /** Parses `sql` as one statement (backslash escapes enabled, timeout value 30000)
    * and casts the result to an INSERT.
    */
  private def parseInsert(sql: String): Insert = {
    val sqlParser = new CCJSqlParser(new StringProvider(sql))
      .withBackslashEscapeCharacter(true)
      .withTimeOut(30000)
    CCJSqlParserUtil.parseStatement(sqlParser).asInstanceOf[Insert]
  }

  /** Walks from the INSERT down to the expressions of its first VALUES list.
    *
    * Each cast encodes the AST shape this spec expects; a different shape
    * surfaces as a `ClassCastException` and fails the example.
    */
  private def valueRows(insert: Insert) = {
    val firstSelect = insert.getSelect.getSelectBody
      .asInstanceOf[SetOperationList]
      .getSelects
      .get(0)
    val valuesList = firstSelect
      .asInstanceOf[ValuesStatement]
      .getExpressions
      .asInstanceOf[ExpressionList]
    valuesList.getExpressions.asScala
  }

  /** Maps a five-value row (long, string, long, long, long) onto a [[SqlCategory]].
    * Any other arity or value type raises a `MatchError`.
    */
  private def toCategory(row: RowConstructor): SqlCategory =
    row.getExprList.getExpressions.asScala match {
      case mutable.Buffer(id: LongValue,
                          title: StringValue,
                          pages: LongValue,
                          subCats: LongValue,
                          files: LongValue) =>
        SqlCategory(id.getValue,
                    title.getValue,
                    pages.getValue,
                    subCats.getValue,
                    files.getValue)
    }

  "parser" should {
    "parse categories" in {
      // An inline sample is used here. The same statement shape could instead be taken
      // from the first line starting with "INSERT INTO" of the "/category.sql" test
      // resource (read via resourceAsString and split on "\n").
      val tuples = Seq(
        "(1,'NowCommons',0,0,0)",
        "(2,'Зображення:Герби_міст_України',7,0,7)",
        "(3,'Суспільне_надбання',13896,18,13872)"
      )
      val sqlStr = tuples.mkString("INSERT INTO `category` VALUES ", ",", ";")

      val categories = valueRows(parseInsert(sqlStr)).map {
        case row: RowConstructor => toCategory(row)
      }

      val expected = mutable.Buffer(
        SqlCategory(1, "NowCommons", 0, 0, 0),
        SqlCategory(2, "Зображення:Герби_міст_України", 7, 0, 7),
        SqlCategory(3, "Суспільне_надбання", 13896, 18, 13872)
      )
      categories === expected
    }
  }

}
24 changes: 23 additions & 1 deletion scalawiki-wlx/src/main/resources/wlm_ua.conf
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,23 @@
"4-9": 3,
"10-49": 1
}
}
},
"2022" : {
"number-of-authors-bonus": {
same-author-zero-bonus: false,
"0-0": 10,
"1-3": 6,
"4-6": 3,
"7-9": 1
},
"number-of-images-bonus": {
same-author-zero-bonus: false,
"0-0": 10,
"1-3": 6,
"4-9": 3,
"10-49": 1
}
}
}

"nominations": [
Expand Down Expand Up @@ -124,6 +140,12 @@
"pages": ["Template:WLM єврейська спадщина"],
"years": [2019, 2020, 2021, 2022]
},
{
"name": "Плівка",
"listTemplate": "WLM-рядок",
"fileTemplate": "WLM2022-UA-film",
"years": [2022]
},
{
"name": "Віа Регіа",
"listTemplate": "ВЛП-рядок",
Expand Down
Loading