diff --git a/core/src/main/antlr4/org/evomaster/core/parser/RegexEcma262.g4 b/core/src/main/antlr4/org/evomaster/core/parser/RegexEcma262.g4 index 0aa6ab1ef8..31c720960e 100644 --- a/core/src/main/antlr4/org/evomaster/core/parser/RegexEcma262.g4 +++ b/core/src/main/antlr4/org/evomaster/core/parser/RegexEcma262.g4 @@ -22,7 +22,7 @@ grammar RegexEcma262; //------ PARSER ------------------------------ // Parser rules have first letter in lower-case -pattern : disjunction; +pattern : disjunction EOF; disjunction @@ -96,13 +96,13 @@ atom //TODO -//CharacterEscape +fragment CharacterEscape // : ControlEscape // | 'c' ControlLetter -// | HexEscapeSequence -// | UnicodeEscapeSequence + : HexEscapeSequence + | UnicodeEscapeSequence //| IdentityEscape -// ; + ; //TODO //ControlEscape @@ -205,7 +205,7 @@ AtomEscape : '\\' CharacterClassEscape //TODO // | '\\' DecimalEscape -// | '\\' CharacterEscape + | '\\' CharacterEscape ; fragment CharacterClassEscape @@ -238,11 +238,17 @@ BaseChar : ~[0-9,^$\\.*+?()[\]{}|-] ; -//TODO -//HexEscapeSequence -// : 'x' HexDigit HexDigit -// ; -// +fragment UnicodeEscapeSequence + : 'u' HexDigit HexDigit HexDigit HexDigit + ; + +fragment HexEscapeSequence + : 'x' HexDigit HexDigit + ; + +fragment HexDigit: + [a-fA-F0-9] + ; //TODO //DecimalIntegerLiteral diff --git a/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4 b/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4 index f9641dbec9..47eb5a2296 100644 --- a/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4 +++ b/core/src/main/antlr4/org/evomaster/core/parser/RegexJava.g4 @@ -23,7 +23,7 @@ grammar RegexJava; //------ PARSER ------------------------------ // Parser rules have first letter in lower-case -pattern : disjunction; +pattern : disjunction EOF; disjunction @@ -119,13 +119,13 @@ quoteChar ; //TODO -//CharacterEscape +fragment CharacterEscape // : ControlEscape // | 'c' ControlLetter -// | HexEscapeSequence -// | UnicodeEscapeSequence + : HexEscapeSequence + | UnicodeEscapeSequence //| 
IdentityEscape -// ; + ; //TODO //ControlEscape @@ -230,7 +230,7 @@ AtomEscape : '\\' CharacterClassEscape //TODO // | '\\' DecimalEscape -// | '\\' CharacterEscape + | '\\' CharacterEscape ; fragment CharacterClassEscape @@ -267,11 +267,17 @@ BaseChar : ~[0-9,^$\\.*+?()[\]{}|-] ; -//TODO -//HexEscapeSequence -// : 'x' HexDigit HexDigit -// ; -// +fragment UnicodeEscapeSequence: + 'u' HexDigit HexDigit HexDigit HexDigit +; + +fragment HexEscapeSequence + : 'x' HexDigit HexDigit + ; + +fragment HexDigit: + [a-fA-F0-9] + ; //TODO //DecimalIntegerLiteral diff --git a/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexEcma262Visitor.kt b/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexEcma262Visitor.kt index 74cb86749b..3cdb7786cb 100644 --- a/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexEcma262Visitor.kt +++ b/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexEcma262Visitor.kt @@ -2,6 +2,7 @@ package org.evomaster.core.parser import org.evomaster.core.search.gene.regex.* +private const val EOF_TOKEN = "<EOF>" /** * Parser Visitor based on the RegexEcma262.g4 grammar file */ @@ -16,7 +17,8 @@ class GeneRegexEcma262Visitor : RegexEcma262BaseVisitor<VisitResult>(){ val disjList = DisjunctionListRxGene(res.genes.map { it as DisjunctionRxGene }) - val gene = RegexGene("regex", disjList,"${RegexGene.JAVA_REGEX_PREFIX}$text") + // we remove the <EOF> token from the end of the string to store as sourceRegex + val gene = RegexGene("regex", disjList,"${RegexGene.JAVA_REGEX_PREFIX}${text.removeSuffix(EOF_TOKEN)}") return VisitResult(gene) } @@ -166,9 +168,22 @@ class GeneRegexEcma262Visitor : RegexEcma262BaseVisitor<VisitResult>(){ return VisitResult(gene) } - if(ctx.AtomEscape() != null){ - val char = ctx.AtomEscape().text[1].toString() - return VisitResult(CharacterClassEscapeRxGene(char)) + if(ctx.AtomEscape() != null) { + val txt = ctx.AtomEscape().text + when { + txt[1] == 'x' || txt[1] == 'u' -> { + val hexValue = + txt.subSequence(2, txt.length).toString().toInt(16) + return 
VisitResult( + PatternCharacterBlockGene( + txt, + hexValue.toChar().toString() + ) + ) + } + + else -> return VisitResult(CharacterClassEscapeRxGene(txt[1].toString())) + } } if(ctx.disjunction() != null){ diff --git a/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt b/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt index 2fe1081538..bd25b11714 100644 --- a/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt +++ b/core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt @@ -2,6 +2,7 @@ package org.evomaster.core.parser import org.evomaster.core.search.gene.regex.* +private const val EOF_TOKEN = "<EOF>" /** * Created by arcuri82 on 11-Sep-19. */ @@ -16,7 +17,8 @@ class GeneRegexJavaVisitor : RegexJavaBaseVisitor<VisitResult>(){ val disjList = DisjunctionListRxGene(res.genes.map { it as DisjunctionRxGene }) - val gene = RegexGene("regex", disjList,"${RegexGene.JAVA_REGEX_PREFIX}$text") + // we remove the <EOF> token from the end of the string to store as sourceRegex + val gene = RegexGene("regex", disjList,"${RegexGene.JAVA_REGEX_PREFIX}${text.removeSuffix(EOF_TOKEN)}") return VisitResult(gene) } @@ -179,8 +181,20 @@ class GeneRegexJavaVisitor : RegexJavaBaseVisitor<VisitResult>(){ } if(ctx.AtomEscape() != null){ - val char = ctx.AtomEscape().text[1].toString() - return VisitResult(CharacterClassEscapeRxGene(char)) + val txt = ctx.AtomEscape().text + when { + txt[1] == 'x' || txt[1] == 'u' -> { + val hexValue = + txt.subSequence(2, txt.length).toString().toInt(16) + return VisitResult( + PatternCharacterBlockGene( + txt, + hexValue.toChar().toString() + ) + ) + } + else -> return VisitResult(CharacterClassEscapeRxGene(txt[1].toString())) + } } if(ctx.disjunction() != null){ diff --git a/core/src/test/kotlin/org/evomaster/core/parser/GeneRegexEcma262VisitorTest.kt b/core/src/test/kotlin/org/evomaster/core/parser/GeneRegexEcma262VisitorTest.kt index fbf487d70d..2a906745c9 100644 --- 
a/core/src/test/kotlin/org/evomaster/core/parser/GeneRegexEcma262VisitorTest.kt +++ b/core/src/test/kotlin/org/evomaster/core/parser/GeneRegexEcma262VisitorTest.kt @@ -326,4 +326,14 @@ open class GeneRegexEcma262VisitorTest : RegexTestTemplate(){ // p = 1 / 2^6 = 1 / 64 checkCanSample("^((a|A)(b|B)(c|C)123(e|E)(f|F)(d|D))$", "aBc123EFd", 10_000) } + + @Test + fun testHexEscape(){ + checkSameAsJava("""\x00\x0a\xba\xFF""") + } + + @Test + fun testUnicodeEscape(){ + checkSameAsJava("""\u0000\u0a0b\uffff""") + } } \ No newline at end of file diff --git a/core/src/test/kotlin/org/evomaster/core/parser/RegexHandlerTest.kt b/core/src/test/kotlin/org/evomaster/core/parser/RegexHandlerTest.kt index 5fd3af1e7d..b77e39c04e 100644 --- a/core/src/test/kotlin/org/evomaster/core/parser/RegexHandlerTest.kt +++ b/core/src/test/kotlin/org/evomaster/core/parser/RegexHandlerTest.kt @@ -121,4 +121,15 @@ internal class RegexHandlerTest{ } + @Test + fun testCreateGeneForJVMInvalidRegex() { + + assertThrows(ParseCancellationException::class.java) { RegexHandler.createGeneForJVM("\\xR") } + } + + @Test + fun testCreateGeneForEcma262InvalidRegex() { + + assertThrows(ParseCancellationException::class.java) { RegexHandler.createGeneForEcma262("\\xR") } + } } \ No newline at end of file