Skip to content

Commit

Permalink
Не определил спам #32
Browse files Browse the repository at this point in the history
  • Loading branch information
peterarsentev committed Oct 11, 2024
1 parent f872866 commit 354d3f9
Show file tree
Hide file tree
Showing 4 changed files with 97 additions and 27 deletions.
45 changes: 38 additions & 7 deletions src/main/kotlin/pro/dionea/service/IdentifyLang.kt
Original file line number Diff line number Diff line change
@@ -1,19 +1,50 @@
package pro.dionea.service

class IdentifyLang(val lex: Set<String>) {
class IdentifyLang(private val lex: Set<String>) {
private val englishRange = 'a' .. 'z'
private val russianRange = 'а'.. 'я'

enum class Lang {
RUS, ENG
RUS, ENG, MIXED, UNDEFINED
}

private fun Set<String>.countBy(range: CharRange): Int =
this.count { word -> word.any { it in range } }
flatMap { it.asSequence() }
.count { it in range }

fun lang() : Lang = if (lex.countBy(englishRange) >= lex.countBy(russianRange)) {
Lang.ENG
} else {
Lang.RUS
fun sizeByLang() : Map<Lang, Int>
= mapOf(
Lang.RUS to lex.countBy(russianRange),
Lang.ENG to lex.countBy(englishRange)
)

/**
 * Classifies a single word by the alphabets its characters belong to.
 *
 * Only characters that fall inside [russianRange] or [englishRange] are
 * taken into account; everything else (digits, punctuation, ...) is ignored.
 *
 * @return [Lang.MIXED] when the word has characters from both ranges,
 * [Lang.RUS] or [Lang.ENG] when it has characters from exactly one of them,
 * and [Lang.UNDEFINED] when it has characters from neither.
 */
private fun String.lang(): Lang {
    val hasRussian = any { it in russianRange }
    val hasEnglish = any { it in englishRange }
    return when {
        hasRussian && hasEnglish -> Lang.MIXED
        hasRussian -> Lang.RUS
        hasEnglish -> Lang.ENG
        else -> Lang.UNDEFINED
    }
}

/**
 * Determines the language of the whole lexicon from its individual words.
 *
 * Words classified as [Lang.UNDEFINED] do not take part in the decision.
 *
 * @return [Lang.UNDEFINED] when no word could be classified, the shared
 * classification when every classified word agrees (this may itself be
 * [Lang.MIXED]), and [Lang.MIXED] when at least two classified words differ.
 */
fun lang(): Lang {
    // Distinct classifications of all words that contain recognisable letters.
    val detected = lex
        .map { it.lang() }
        .filterNot { it == Lang.UNDEFINED }
        .toSet()
    return when (detected.size) {
        0 -> Lang.UNDEFINED
        1 -> detected.first()
        else -> Lang.MIXED
    }
}
}
18 changes: 4 additions & 14 deletions src/main/kotlin/pro/dionea/service/SpamAnalysis.kt
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ class SpamAnalysis(
companion object {
val CONTACT_PATTERN = Pattern.compile("@\\w+")
const val MIN_SIZE_OF_MESSAGE = 35
const val CONVERTED_LETTERS = 20
}

fun isSpam(text: String): SpamReason {
Expand All @@ -32,7 +31,6 @@ class SpamAnalysis(
if (emojis.isNotEmpty() && count > 0) {
return SpamReason(true, "Содержит эмодзи и контактный логин.")
}
val converted = ConvertedLetter()
val lex = EmojiParser.removeAllEmojis(text)
.replace("[.,+~?!:;(){}\n/]".toRegex(), " ")
.split("\\s+".toRegex())
Expand All @@ -41,17 +39,9 @@ class SpamAnalysis(
.map { it.lowercase() }
.toSet()
val lang = IdentifyLang(lex).lang()
val words =
if (lang == IdentifyLang.Lang.RUS) {
val convertedLetter = converted.englishToRussian(lex)
if (convertedLetter.second >= CONVERTED_LETTERS) {
return SpamReason(true, "Русские буквы заменены на английские.")
} else {
convertedLetter.first
}
} else {
lex
}
if (lang == IdentifyLang.Lang.MIXED) {
return SpamReason(true, "Русские буквы заменены на английские.")
}

val filters = filterService.getAll()
val fkeys = keyService.getAll().groupBy { it.filter.id }
Expand All @@ -62,7 +52,7 @@ class SpamAnalysis(
var matched = 0
for (key in keys) {
val baseWords = kvalues.get(key.id)!!.map { it.value }.toList()
val coincidences = words.containsAny(baseWords)
val coincidences = lex.containsAny(baseWords)
if (coincidences.size >= 3) {
return SpamReason(true,
"Стоп-фильтр \"${filter.name}\": ${coincidences.joinToString(", ")}\n")
Expand Down
50 changes: 46 additions & 4 deletions src/test/kotlin/pro/dionea/SpamAnalysisSignEnglishLetterTest.kt
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package pro.dionea

import org.assertj.core.api.Assertions.*
import org.junit.jupiter.api.Disabled
import org.junit.jupiter.api.Test
import pro.dionea.domain.Filter
import pro.dionea.domain.KValue
Expand All @@ -14,7 +13,6 @@ import pro.dionea.service.KValueService
import pro.dionea.service.KeyService
import pro.dionea.service.SpamAnalysis

@Disabled
class SpamAnalysisSignEnglishLetterTest {

@Test
Expand All @@ -25,7 +23,51 @@ class SpamAnalysisSignEnglishLetterTest {
val keyService = KeyService(keyRepository)
val kvalueRepository = KValueFakeRepository()
val kvalueService = KValueService(kvalueRepository)
val text = "Coтpyднuчecтвo в cфepe Crуptо, выгoдныe ycлoвия, oпыт нe нyжeн oбyчuм, cвязaтьcя: @KSTRun"
assertThat(SpamAnalysis(filterService, keyService, kvalueService).isSpam(text).spam).isTrue()
val text = "Coтpyднuчecтвo в cфepe Crуptо, " +
"выгoдныe ycлoвия, " +
"oпыт нe нyжeн oбyчuм, cвязaтьcя: @KSTRun"
assertThat(SpamAnalysis(filterService, keyService, kvalueService)
.isSpam(text).spam).isTrue()
}


/**
 * A message in which the Latin letter "u" appears inside otherwise
 * Cyrillic-looking words must be reported as spam.
 */
@Test
fun whenContainsU() {
    val filters = FilterFakeRepository()
    val filterService = FilterService(filters)
    val keys = KeyFakeRepository()
    val keyService = KeyService(keys)
    val values = KValueFakeRepository()
    val kvalueService = KValueService(values)

    // One filter with two keys, each key holding a single stop-word.
    val filter = filters.save(Filter(1))
    val incomeKey = keys.save(Key(1, filter))
    values.save(KValue(1, incomeKey, "доход"))
    val writeKey = keys.save(Key(2, filter))
    values.save(KValue(2, writeKey, "писать"))

    val message = "Кoму uнтересен удaленный дохoд с хoрошей прuбылью,пuсать в лc"
    val verdict = SpamAnalysis(filterService, keyService, kvalueService).isSpam(message)

    assertThat(verdict.spam).isTrue()
}

/**
 * A "passive income" style message must be reported as spam when the filter
 * holds the stop-words "пасив" and "пиши" under two separate keys.
 */
@Test
fun whenContainsPassive() {
    val filters = FilterFakeRepository()
    val filterService = FilterService(filters)
    val keys = KeyFakeRepository()
    val keyService = KeyService(keys)
    val values = KValueFakeRepository()
    val kvalueService = KValueService(values)

    // One filter with two keys, each key holding a single stop-word.
    val filter = filters.save(Filter(1))
    val passiveKey = keys.save(Key(1, filter))
    values.save(KValue(1, passiveKey, "пасив"))
    val writeKey = keys.save(Key(2, filter))
    values.save(KValue(2, writeKey, "пиши"))

    val message = "Можно получать от 300 \$ в день на пасиве .Интересно пишите +"
    val verdict = SpamAnalysis(filterService, keyService, kvalueService).isSpam(message)

    assertThat(verdict.spam).isTrue()
}
}
11 changes: 9 additions & 2 deletions src/test/kotlin/pro/dionea/service/IdentifyLangTest.kt
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ class IdentifyLangTest {
fun whenTextRussia() {
assertThat(
IdentifyLang(
setOf("Писать")
setOf("писать")
).lang()
).isEqualTo(IdentifyLang.Lang.RUS)
}
Expand All @@ -17,8 +17,15 @@ class IdentifyLangTest {
fun whenMessageContainsNumber() {
assertThat(
IdentifyLang(
setOf("Писать", "1234567890")
setOf("писать", "1234567890")
).lang()
).isEqualTo(IdentifyLang.Lang.RUS)
}

/**
 * For words that mix alphabets, the per-language size reported for
 * [IdentifyLang.Lang.RUS] is 9 for the two sample words below.
 */
@Test
fun whenMessageContainsMix() {
    val sizes = IdentifyLang(setOf("дохoд", "пuсать")).sizeByLang()

    assertThat(sizes[IdentifyLang.Lang.RUS]).isEqualTo(9)
}
}

0 comments on commit 354d3f9

Please sign in to comment.