Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
0e3e54e
Java regex added char class intersection, union (nesting) & subtraction.
lmasroca May 26, 2026
0364ea7
Allowing empty MultiCharacterRange.
lmasroca May 27, 2026
2b4638d
Allowing invalid back refs as empty regex.
lmasroca May 27, 2026
ab95861
Identifying potential empty regex for later filtering.
lmasroca May 27, 2026
0de98aa
quick fix
lmasroca May 27, 2026
2188070
Filtering empty regex alternatives from patterns, allowing for sampli…
lmasroca May 27, 2026
a646c33
Merge remote-tracking branch 'refs/remotes/origin/master' into regex-…
lmasroca May 27, 2026
45db61d
Added some tests regarding empty regex alternatives
lmasroca May 27, 2026
a9b26b4
Added more tests regarding empty regex alternatives
lmasroca May 27, 2026
466ee54
Removed bool from VisitResult used to represent unsat. regexes.
lmasroca Jun 3, 2026
160bc29
Removed redundant overrides for isEffectivelyEmpty method.
lmasroca Jun 3, 2026
65df7ec
Merge remote-tracking branch 'origin/master' into regex-support-exten…
lmasroca Jun 3, 2026
e717bae
Edge case fixed.
lmasroca Jun 3, 2026
3b1bb24
Merge remote-tracking branch 'origin/master' into regex-support-exten…
lmasroca Jun 5, 2026
a00f566
Added explicit throw when regex is unsatisfiable.
lmasroca Jun 7, 2026
1cc8e06
Merge remote-tracking branch 'origin/master' into regex-support-exten…
lmasroca Jun 7, 2026
c0a7587
Made some adjustments and added some comments.
lmasroca Jun 8, 2026
871ae0c
Merge remote-tracking branch 'origin/master' into regex-support-exten…
lmasroca Jun 8, 2026
7a26ffb
Added some comments.
lmasroca Jun 9, 2026
9bc0f0e
Merge remote-tracking branch 'origin/master' into regex-support-exten…
lmasroca Jun 9, 2026
a7e6dd7
Added another comment.
lmasroca Jun 9, 2026
7291f0b
Added comment explaining nullable DisjunctionListRxGene in capture gr…
lmasroca Jun 9, 2026
194cc96
Refactor: renamed RxTerm.isEffectivelyEmpty to isUnsatisfiable
lmasroca Jun 16, 2026
0a6840f
Merge remote-tracking branch 'origin/master' into regex-support-exten…
lmasroca Jun 16, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 92 additions & 36 deletions core/src/main/kotlin/org/evomaster/core/parser/GeneRegexJavaVisitor.kt
Original file line number Diff line number Diff line change
Expand Up @@ -40,14 +40,22 @@ class GeneRegexJavaVisitor : RegexJavaBaseVisitor<VisitResult>(){
* Capture groups in order of appearance (1-based index -> list index 0).
* Populated as the tree is walked. A backreference is only valid if it
* appears after the group it references, which Java regex requires anyway.
* An entry is nullable to represent a captured group that is unsatisfiable,
* for example when the group contains an empty character class like `([a&&b])`.
* In that case the list holds null instead of a DisjunctionListRxGene.
* @see buildDisjunctionList
*/
private val captureGroups = mutableListOf<DisjunctionListRxGene?>()

/**
* Same as [captureGroups] but for named backreferences, which can be accessed
* with their name or number.
* The value is nullable to represent a captured group that is unsatisfiable,
* for example when the group contains an empty character class like `([a&&b])`.
* In that case the map holds null instead of a DisjunctionListRxGene.
* @see buildDisjunctionList
*/
private val namedCaptureGroups = mutableMapOf<String, DisjunctionListRxGene>()
private val namedCaptureGroups = mutableMapOf<String, DisjunctionListRxGene?>()
Comment thread
lmasroca marked this conversation as resolved.

/**
* Tracks the flags active in the current lexical scope.
Expand All @@ -73,14 +81,48 @@ class GeneRegexJavaVisitor : RegexJavaBaseVisitor<VisitResult>(){
)
}

/**
 * Visits [ctx] and collects its satisfiable alternatives into a [DisjunctionListRxGene].
 *
 * @param ctx the disjunction parse-tree node to visit
 * @return the resulting gene, or null when no satisfiable alternative remains
 */
private fun buildDisjunctionList(ctx: RegexJavaParser.DisjunctionContext): DisjunctionListRxGene? {
    val satisfiable = ctx.accept(this).genes
        .map { it as DisjunctionRxGene }
        .filterNot { it.isUnsatisfiable() }

    // DisjunctionListRxGene extends CompositeFixedGene, so its disjunctions list cannot be empty:
    // null stands in for an unsatisfiable DisjunctionListRxGene.
    if (satisfiable.isEmpty()) {
        return null
    }

    //TODO tmp hack until full handling of ^$. Assume full match when nested disjunctions
    return DisjunctionListRxGene(satisfiable).also { list ->
        list.disjunctions.forEach { alternative ->
            alternative.extraPrefix = false
            alternative.extraPostfix = false
            alternative.matchStart = true
            alternative.matchEnd = true
        }
    }
}

override fun visitPattern(ctx: RegexJavaParser.PatternContext): VisitResult {

val res = ctx.disjunction().accept(this)

val text = RegexUtils.getRegexExpByParserRuleContext(ctx)

val disjList = DisjunctionListRxGene(res.genes.map { it as DisjunctionRxGene })
val satisfiableDisjunctions = res.genes
.map { it as DisjunctionRxGene }
.filter{ !it.isUnsatisfiable() }

if (satisfiableDisjunctions.isEmpty()) {
throw IllegalStateException("Regex is unsatisfiable.")
}

val disjList = DisjunctionListRxGene(satisfiableDisjunctions)

// we remove the <EOF> token from end of the string to store as sourceRegex
val gene = RegexGene(
Expand All @@ -101,9 +143,19 @@ class GeneRegexJavaVisitor : RegexJavaBaseVisitor<VisitResult>(){
val matchStart = assertionMatches.first
val matchEnd = assertionMatches.second

val disj = DisjunctionRxGene("disj", altRes.genes.map { it }, matchStart, matchEnd)
val res = VisitResult()

// add disjunction if it has genes, OR if the alternative was purely assertions (^$) or flag scopes
// in that case altRes.genes is empty but the alternative is valid (matches "")
val hasOnlyAssertionsOrFlagScopes = ctx.alternative().term().isNotEmpty() &&
ctx.alternative().term().all { it.assertion() != null || it.FLAG_SCOPE_OPEN() != null }

if (altRes.genes.isNotEmpty() || hasOnlyAssertionsOrFlagScopes || ctx.alternative().term().isEmpty()) {
val disj = DisjunctionRxGene("disj", altRes.genes.map { it }, matchStart, matchEnd)

val res = VisitResult(disj)
res.genes.add(disj)
}
// else: had non-assertion terms but all produced nothing (empty char class etc.), skip

if(ctx.disjunction() != null){
val disjRes = ctx.disjunction().accept(this)
Expand Down Expand Up @@ -169,7 +221,7 @@ class GeneRegexJavaVisitor : RegexJavaBaseVisitor<VisitResult>(){
// term is not a back ref: we use the default behavior, term results may only have 0-1 genes
// if there is a gene, we add it to result
res.genes.add(gene)
} else {
} else if (resTerm.data is String) {

val assertion = resTerm.data as String
if(i==0 && assertion == "^"){
Expand All @@ -184,6 +236,9 @@ class GeneRegexJavaVisitor : RegexJavaBaseVisitor<VisitResult>(){
*/
throw IllegalStateException("Cannot support $assertion at position $i")
}
} else {
// unsatisfiable term, return with no genes
return VisitResult(data=Pair(false, false))
}
}

Expand All @@ -203,12 +258,23 @@ class GeneRegexJavaVisitor : RegexJavaBaseVisitor<VisitResult>(){

val resAtom = ctx.atom().accept(this)
val atom = resAtom.genes.firstOrNull()
?: return res

if(ctx.quantifier() != null){

val limits = ctx.quantifier().accept(this).data as Pair<Int,Int>

// if quantified atom is unsatisfiable we must then check the limits
if (atom == null ||
((atom as? RxTerm)?.isUnsatisfiable() == true) && resAtom.genes.size == 1) {
return if (limits.first == 0) {
// if 0 appearances is allowed then the regex is satisfiable only with empty string
VisitResult(PatternCharacterBlockGene("0_QuantifierOnEmptyRegex", ""))
} else {
// if not then unsatisfiable, return with no genes
res
}
}

// if atom is not a back ref then we use the default behavior, results may only have one gene
var template: Gene = atom

Expand All @@ -233,10 +299,11 @@ class GeneRegexJavaVisitor : RegexJavaBaseVisitor<VisitResult>(){
if (ctx.atom()?.atomEscape()?.BackReference() != null){
// if atom is a BackReference we addAll genes from result as there may be more than one if digits are dropped
res.genes.addAll(resAtom.genes)
} else {
} else if (atom != null) {
// if atom is not a back ref we fall back to the default behavior, results only have one gene
res.genes.add(atom)
}
// else atom is unsatisfiable, return no genes
}

return res
Expand Down Expand Up @@ -306,21 +373,16 @@ class GeneRegexJavaVisitor : RegexJavaBaseVisitor<VisitResult>(){

currentFlags = merged

val res = ctx.disjunction().accept(this)
val disjList = buildDisjunctionList(ctx.disjunction())

currentFlags = previous

val disjList = DisjunctionListRxGene(res.genes.map { it as DisjunctionRxGene })

//TODO tmp hack until full handling of ^$. Assume full match when nested disjunctions
for (gene in disjList.disjunctions) {
gene.extraPrefix = false
gene.extraPostfix = false
gene.matchStart = true
gene.matchEnd = true
return if (disjList != null) {
VisitResult(disjList)
} else {
// unsatisfiable, return with no genes.
VisitResult()
}

return VisitResult(disjList)
}

if(ctx.quote() != null){
Expand Down Expand Up @@ -354,17 +416,7 @@ class GeneRegexJavaVisitor : RegexJavaBaseVisitor<VisitResult>(){
val groupIndex = captureGroups.size
captureGroups.add(null) // add placeholder for the gene

val res = ctx.disjunction().accept(this)

val disjList = DisjunctionListRxGene(res.genes.map { it as DisjunctionRxGene })

//TODO tmp hack until full handling of ^$. Assume full match when nested disjunctions
for(gene in disjList.disjunctions){
gene.extraPrefix = false
gene.extraPostfix = false
gene.matchStart = true
gene.matchEnd = true
}
val disjList = buildDisjunctionList(ctx.disjunction())

val isCapturingGroup = !ctx.text.startsWith("(?:")
val isNamedCaptureGroup = ctx.NAMED_CAPTURE_GROUP_OPEN() != null
Expand All @@ -380,7 +432,12 @@ class GeneRegexJavaVisitor : RegexJavaBaseVisitor<VisitResult>(){
namedCaptureGroups[name] = disjList
}

return VisitResult(disjList)
return if (disjList != null) {
VisitResult(disjList)
} else {
// unsatisfiable, return with no genes.
VisitResult()
}
}

if(ctx.DOT() != null){
Expand Down Expand Up @@ -599,15 +656,12 @@ class GeneRegexJavaVisitor : RegexJavaBaseVisitor<VisitResult>(){
maxDigits > allDigits.length -> allDigits.length
allDigits.take(maxDigits).toInt() <= captureGroups.size -> maxDigits
maxDigits > 1 -> maxDigits - 1
else -> throw IllegalStateException(
"Backreference ${txt.take(2)} refers to group ${allDigits[0]} but only ${captureGroups.size} " +
"capture group(s) have been defined so far"
)
else -> 1
}

val n = allDigits.take(backRefDigitCount).toInt()

val result = VisitResult(BackReferenceRxGene(n, captureGroups[n - 1]!!))
val result = VisitResult(BackReferenceRxGene(n, captureGroups.getOrNull(n - 1)))

val remainingChars = allDigits.drop(backRefDigitCount)

Expand All @@ -623,8 +677,10 @@ class GeneRegexJavaVisitor : RegexJavaBaseVisitor<VisitResult>(){
if (ctx.NamedBackReference() != null) {
// strip "\k<" and ">"
val name = txt.drop(3).dropLast(1)
if(name !in namedCaptureGroups){
throw IllegalStateException("Named backreference \\k<$name> refers to unknown group '$name'")
}
val group = namedCaptureGroups[name]
?: throw IllegalStateException("Named backreference \\k<$name> refers to unknown group '$name'")
val groupIndex = captureGroups.indexOf(group) + 1 // 1-based, for the gene name
return VisitResult(BackReferenceRxGene(groupIndex, group))
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,18 @@ import org.evomaster.core.search.service.mutator.genemutation.SubsetGeneMutation
* Represents a backreference \N in a regex (N being a number).
* Its value is always identical to the current value of its [captureGroup].
* It has no independent state and is therefore immutable.
* If capture group is null then the referenced group was unsatisfiable,
* in which case the same is true for the backreference to it.
*/
class BackReferenceRxGene(
val groupIndex: Int,
val captureGroup: DisjunctionListRxGene
val captureGroup: DisjunctionListRxGene?
) : RxAtom, SimpleGene("\\$groupIndex") {

/**
 * A backreference is unsatisfiable when it has no capture group (null)
 * or when the capture group it points to is itself unsatisfiable.
 */
override fun isUnsatisfiable(): Boolean =
    captureGroup?.isUnsatisfiable() ?: true

override fun checkForLocallyValidIgnoringChildren(): Boolean = true

/**
Expand Down Expand Up @@ -59,7 +65,12 @@ class BackReferenceRxGene(
mode: GeneUtils.EscapeMode?,
targetFormat: OutputFormat?,
extraCheck: Boolean
): String = captureGroup.getValueAsPrintableString(targetFormat = null)
): String {
if (captureGroup == null) {
throw IllegalStateException("Cannot get value from invalid backreference \\$groupIndex")
}
return captureGroup.getValueAsPrintableString(previousGenes, mode, targetFormat)
}

override fun containsSameValueAs(other: Gene): Boolean {
if (other !is BackReferenceRxGene) return false
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ class CharacterClassEscapeRxGene(
// create both normal and negated version for all
.flatMap { (key, value) ->
listOf(
key to MultiCharacterRange(value),
key to MultiCharacterRange(false, value),
"^$key" to MultiCharacterRange(true, value)
)
}.toMap()
Expand Down Expand Up @@ -132,6 +132,12 @@ class CharacterClassEscapeRxGene(
}
}

/**
 * Reports this gene as unsatisfiable when [multiCharRange] is empty.
 */
override fun isUnsatisfiable(): Boolean {
    return multiCharRange.isEmpty
}

/**
 * The gene is mutable exactly when it is not unsatisfiable.
 */
override fun isMutable(): Boolean = !isUnsatisfiable()

override fun checkForLocallyValidIgnoringChildren() : Boolean{
// we pass the same embedded flags to the regex to accurately match the expected behavior
return value.matches(Regex("${flags.getScopeString()}\\$type"))
Expand Down Expand Up @@ -193,6 +199,9 @@ class CharacterClassEscapeRxGene(
}

override fun getValueAsPrintableString(previousGenes: List<Gene>, mode: GeneUtils.EscapeMode?, targetFormat: OutputFormat?, extraCheck: Boolean): String {
if (isUnsatisfiable()) {
throw IllegalStateException("Cannot get value from empty CharacterClassEscape")
}
return if (!flags.isCaseable(value[0])) {
value[0].toString()
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,16 @@ class CharacterRangeRxGene(
private val log = LoggerFactory.getLogger(CharacterRangeRxGene::class.java)
}

var value : Char = validRanges[0].start
var value : Char = if (isUnsatisfiable()) '\u0000' else validRanges[0].start

/**
* Whether to output the character in uppercase.
* Only meaningful when flags.caseInsensitive is true.
*/
var useUpperCase: Boolean = false

/**
 * Reports this gene as unsatisfiable when [validRanges] is empty.
 */
override fun isUnsatisfiable(): Boolean {
    return validRanges.isEmpty
}

override fun checkForLocallyValidIgnoringChildren() : Boolean{
return validRanges.any {
value in it ||
Expand All @@ -49,6 +51,9 @@ class CharacterRangeRxGene(
}

/**
 * An unsatisfiable range is never mutable. Otherwise the gene is mutable when
 * there is more than one character to choose from, or when the current
 * character is caseable.
 */
override fun isMutable(): Boolean =
    !isUnsatisfiable() && (validRanges.charCount > 1 || flags.isCaseable(value))
Expand Down Expand Up @@ -134,6 +139,9 @@ class CharacterRangeRxGene(
TODO should \ be handled specially?
In any case, would have same handling as AnyCharacterRxGene
*/
if (isUnsatisfiable()) {
throw IllegalStateException("Cannot get value from empty CharacterRange")
}
return if (!flags.isCaseable(value)) {
value.toString()
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ class DisjunctionRxGene(
private val log : Logger = LoggerFactory.getLogger(DisjunctionRxGene::class.java)
}

/**
 * Unsatisfiable as soon as one of the [terms] is an [RxTerm] that is itself
 * unsatisfiable. With no terms at all the result is false.
 */
override fun isUnsatisfiable(): Boolean {
    return terms.any { term -> term is RxTerm && term.isUnsatisfiable() }
}

override fun checkForLocallyValidIgnoringChildren() : Boolean{
return true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,8 @@ class QuantifierRxGene(
if (min < 0) {
throw IllegalArgumentException("Invalid min value '$min': should be positive")
}
if (max < 1) {
throw IllegalArgumentException("Invalid max value '$max': should be at least 1")
if (max < 0) {
throw IllegalArgumentException("Invalid max value '$max': should be positive")
}
if (min > max) {
throw IllegalArgumentException("Invalid min-max values '$min-$max': min is greater than max")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,11 @@ import org.evomaster.core.search.StructuralElement
import org.evomaster.core.search.gene.Gene


interface RxTerm
interface RxTerm {
    /**
     * Tells whether this gene can never produce a valid value
     * (e.g. an empty character class intersection such as [a&&b]).
     * Checked at construction time so that unsatisfiable branches can be
     * filtered out of disjunctions. The default answer is false.
     */
    fun isUnsatisfiable(): Boolean {
        return false
    }
}
Loading