Skip to content
28 changes: 17 additions & 11 deletions core/src/main/antlr4/org/evomaster/core/parser/RegexEcma262.g4
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ grammar RegexEcma262;
//------ PARSER ------------------------------
// Parser rules have first letter in lower-case

pattern : disjunction;
pattern : disjunction EOF;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this EOF needed?
How would the rule behave for input strings that do not end with it?



disjunction
Expand Down Expand Up @@ -96,13 +96,13 @@ atom


// Escape bodies that stand for one literal character. The leading backslash
// is matched by AtomEscape, which is the rule that references this one.
// Declared as a fragment so that it is only usable from other lexer rules and
// is never emitted as a token of its own: as a regular token rule it would
// compete with BaseChar on plain, un-escaped input such as "xab".
//TODO alternatives that are not supported yet:
// | ControlEscape
// | 'c' ControlLetter
// | IdentityEscape
fragment CharacterEscape
 : HexEscapeSequence
 | UnicodeEscapeSequence
 ;

//TODO
//ControlEscape
Expand Down Expand Up @@ -205,7 +205,7 @@ AtomEscape
: '\\' CharacterClassEscape
//TODO
// | '\\' DecimalEscape
// | '\\' CharacterEscape
| '\\' CharacterEscape
;

fragment CharacterClassEscape
Expand Down Expand Up @@ -238,11 +238,17 @@ BaseChar
: ~[0-9,^$\\.*+?()[\]{}|-]
;

//TODO
//HexEscapeSequence
// : 'x' HexDigit HexDigit
// ;
//
// Body of a unicode escape: the letter 'u' followed by exactly four hex digits.
// Declared as a fragment so that it can only be matched as part of another
// lexer rule; as a regular token rule, plain input such as "u0041" (no
// backslash) would be lexed as a single token instead of ordinary characters.
fragment UnicodeEscapeSequence
 : 'u' HexDigit HexDigit HexDigit HexDigit
 ;

// Body of a hex escape: the letter 'x' followed by exactly two hex digits.
// Declared as a fragment so that it can only be matched as part of another
// lexer rule; as a regular token rule, plain input such as "xab" (no
// backslash) would be lexed as a single token instead of ordinary characters.
fragment HexEscapeSequence
 : 'x' HexDigit HexDigit
 ;

// One hexadecimal digit, in either letter case.
fragment HexDigit
 : [a-fA-F0-9]
 ;

//TODO
//DecimalIntegerLiteral
Expand Down
28 changes: 17 additions & 11 deletions core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ grammar RegexJava;
//------ PARSER ------------------------------
// Parser rules have first letter in lower-case

pattern : disjunction;
pattern : disjunction EOF;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

see previous comment



disjunction
Expand Down Expand Up @@ -119,13 +119,13 @@ quoteChar
;

// Escape bodies that stand for one literal character. The leading backslash
// is matched by AtomEscape, which is the rule that references this one.
// Declared as a fragment so that it is only usable from other lexer rules and
// is never emitted as a token of its own: as a regular token rule it would
// compete with BaseChar on plain, un-escaped input such as "xab".
//TODO alternatives that are not supported yet:
// | ControlEscape
// | 'c' ControlLetter
// | IdentityEscape
fragment CharacterEscape
 : HexEscapeSequence
 | UnicodeEscapeSequence
 ;

//TODO
//ControlEscape
Expand Down Expand Up @@ -230,7 +230,7 @@ AtomEscape
: '\\' CharacterClassEscape
//TODO
// | '\\' DecimalEscape
// | '\\' CharacterEscape
| '\\' CharacterEscape
;

fragment CharacterClassEscape
Expand Down Expand Up @@ -267,11 +267,17 @@ BaseChar
: ~[0-9,^$\\.*+?()[\]{}|-]
;

//TODO
//HexEscapeSequence
// : 'x' HexDigit HexDigit
// ;
//
// Body of a unicode escape: the letter 'u' followed by exactly four hex digits.
// Declared as a fragment so that it can only be matched as part of another
// lexer rule; as a regular token rule, plain input such as "u0041" (no
// backslash) would be lexed as a single token instead of ordinary characters.
fragment UnicodeEscapeSequence
 : 'u' HexDigit HexDigit HexDigit HexDigit
 ;

// Body of a hex escape: the letter 'x' followed by exactly two hex digits.
// Declared as a fragment so that it can only be matched as part of another
// lexer rule; as a regular token rule, plain input such as "xab" (no
// backslash) would be lexed as a single token instead of ordinary characters.
fragment HexEscapeSequence
 : 'x' HexDigit HexDigit
 ;

// One hexadecimal digit, in either letter case.
fragment HexDigit
 : [a-fA-F0-9]
 ;

//TODO
//DecimalIntegerLiteral
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package org.evomaster.core.parser

import org.evomaster.core.search.gene.regex.*

// Textual form of the end-of-input token; the visitor strips it from the end
// of the parsed pattern text before storing that text as the source regex.
private const val EOF_TOKEN = "<EOF>"
/**
* Parser Visitor based on the RegexEcma262.g4 grammar file
*/
Expand All @@ -16,7 +17,8 @@ class GeneRegexEcma262Visitor : RegexEcma262BaseVisitor<VisitResult>(){

val disjList = DisjunctionListRxGene(res.genes.map { it as DisjunctionRxGene })

val gene = RegexGene("regex", disjList,"${RegexGene.JAVA_REGEX_PREFIX}$text")
// we remove the <EOF> token from end of the string to store as sourceRegex
val gene = RegexGene("regex", disjList,"${RegexGene.JAVA_REGEX_PREFIX}${text.substring(0,text.length - EOF_TOKEN.length)}")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What if the text does not end with the EOF token?


return VisitResult(gene)
}
Expand Down Expand Up @@ -166,9 +168,22 @@ class GeneRegexEcma262Visitor : RegexEcma262BaseVisitor<VisitResult>(){
return VisitResult(gene)
}

if(ctx.AtomEscape() != null){
val char = ctx.AtomEscape().text[1].toString()
return VisitResult(CharacterClassEscapeRxGene(char))
if(ctx.AtomEscape() != null) {
val txt = ctx.AtomEscape().text
when {
txt[1] == 'x' || txt[1] == 'u' -> {
val hexValue =
txt.subSequence(2, txt.length).toString().toInt(16)
return VisitResult(
PatternCharacterBlockGene(
txt,
hexValue.toChar().toString()
)
)
}

else -> return VisitResult(CharacterClassEscapeRxGene(txt[1].toString()))
}
}

if(ctx.disjunction() != null){
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package org.evomaster.core.parser

import org.evomaster.core.search.gene.regex.*

// Textual form of the end-of-input token; the visitor strips it from the end
// of the parsed pattern text before storing that text as the source regex.
private const val EOF_TOKEN = "<EOF>"
/**
* Created by arcuri82 on 11-Sep-19.
*/
Expand All @@ -16,7 +17,8 @@ class GeneRegexJavaVisitor : RegexJavaBaseVisitor<VisitResult>(){

val disjList = DisjunctionListRxGene(res.genes.map { it as DisjunctionRxGene })

val gene = RegexGene("regex", disjList,"${RegexGene.JAVA_REGEX_PREFIX}$text")
// we remove the <EOF> token from end of the string to store as sourceRegex
val gene = RegexGene("regex", disjList,"${RegexGene.JAVA_REGEX_PREFIX}${text.substring(0, text.length - EOF_TOKEN.length)}")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

see previous comment


return VisitResult(gene)
}
Expand Down Expand Up @@ -179,8 +181,20 @@ class GeneRegexJavaVisitor : RegexJavaBaseVisitor<VisitResult>(){
}

if(ctx.AtomEscape() != null){
val char = ctx.AtomEscape().text[1].toString()
return VisitResult(CharacterClassEscapeRxGene(char))
val txt = ctx.AtomEscape().text
when {
txt[1] == 'x' || txt[1] == 'u' -> {
val hexValue =
txt.subSequence(2, txt.length).toString().toInt(16)
return VisitResult(
PatternCharacterBlockGene(
txt,
hexValue.toChar().toString()
)
)
}
else -> return VisitResult(CharacterClassEscapeRxGene(txt[1].toString()))
}
}

if(ctx.disjunction() != null){
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -326,4 +326,14 @@ open class GeneRegexEcma262VisitorTest : RegexTestTemplate(){
// p = 1 / 2^6 = 1 / 64
checkCanSample("^((a|A)(b|B)(c|C)123(e|E)(f|F)(d|D))$", "aBc123EFd", 10_000)
}

/**
 * Feeds a regex made only of two-digit hex escapes, with both lower-case and
 * upper-case digits, to [checkSameAsJava] (defined outside this view).
 */
@Test
fun testHexEscape() {
    val hexEscapes = """\x00\x0a\xba\xFF"""
    checkSameAsJava(hexEscapes)
}

/**
 * Feeds a regex made only of four-digit unicode escapes, covering the lowest
 * and highest code units, to [checkSameAsJava] (defined outside this view).
 */
@Test
fun testUnicodeEscape() {
    val unicodeEscapes = """\u0000\u0a0b\uffff"""
    checkSameAsJava(unicodeEscapes)
}
}
11 changes: 11 additions & 0 deletions core/src/test/kotlin/org/evomaster/core/parser/RegexHandlerTest.kt
Original file line number Diff line number Diff line change
Expand Up @@ -121,4 +121,15 @@ internal class RegexHandlerTest{

}

/**
 * A hex escape whose first digit is not hexadecimal must make the JVM regex
 * factory fail with a [ParseCancellationException].
 */
@Test
fun testCreateGeneForJVMInvalidRegex() {
    // backslash, 'x', then 'R' which is not a hex digit
    val malformedRegex = "\\xR"

    assertThrows(ParseCancellationException::class.java) {
        RegexHandler.createGeneForJVM(malformedRegex)
    }
}

/**
 * A hex escape whose first digit is not hexadecimal must make the ECMA-262
 * regex factory fail with a [ParseCancellationException].
 */
@Test
fun testCreateGeneForEcma262InvalidRegex() {
    // backslash, 'x', then 'R' which is not a hex digit
    val malformedRegex = "\\xR"

    assertThrows(ParseCancellationException::class.java) {
        RegexHandler.createGeneForEcma262(malformedRegex)
    }
}
}