|
JavaScript 2.0
Formal Description
Regular Expression Semantics
|
Sunday, May 16, 1999
The regular expression semantics describe the actions the regular expression engine takes in order to transform a regular expression pattern into a function for matching against input strings. For convenience, the regular expression grammar is repeated here. See also the description of the semantic notation.
This document is also available as a Word 98 rtf file.
Please excuse the mess. The regular expression semantics below are working (except for multiline and case-insensitive matches) and have been tried on sample cases, but they could be cleaned up somewhat and formatted better.
lineTerminators : {Character} = {‘«LF»’, ‘«CR»’, ‘«u2028»’, ‘«u2029»’}
nonTerminators : {Character} = {‘«NUL»’ ... ‘«uFFFF»’} - lineTerminators
reWhitespaces : {Character} = {‘«FF»’, ‘«LF»’, ‘«CR»’, ‘«TAB»’, ‘«VT»’, ‘ ’}
reDigits : {Character} = {‘0’ ... ‘9’}
reWordCharacters : {Character} = {‘0’ ... ‘9’, ‘A’ ... ‘Z’, ‘a’ ... ‘z’, ‘_’}
type REInput = tuple {str: String; ignoreCase: Boolean; multiline: Boolean}
Field str is the input string. ignoreCase and multiline are the corresponding regular expression flags.
type REResult = oneof {success: REMatch; failure}
type REMatch = tuple {endIndex: Integer; captured: Capture[]}
A REMatch holds an intermediate state during the pattern-matching process. endIndex is the index of the next input character to be matched by the next component in a regular expression pattern. If we are at the end of the pattern, endIndex is one plus the index of the last matched input character. captured is a zero-based array of the strings captured so far by capturing parentheses.
type Capture = oneof {present: String; absent}
type Continuation = REMatch REResult
A Continuation is a function that attempts to match the remaining portion of the pattern against the input string, starting at the intermediate state given by its REMatch argument. If a match is possible, it returns a success result that contains the final REMatch state; if no match is possible, it returns a failure result.
type Matcher = REInput REMatch Integer Continuation REResult
A Matcher is a function that attempts to match a middle portion of the pattern against the input string, starting at the intermediate state given by its REMatch argument. Since the remainder of the pattern heavily influences whether (and how) a middle portion will match, we must pass in a Continuation function that checks whether the rest of the pattern matched. If the continuation returns failure, the matcher function may call it repeatedly, trying various alternatives at pattern choice points.
The REInput parameter contains the input string and is merely passed down to subroutines. The Integer parameter contains the number of capturing left parentheses seen so far in the pattern and is used to assign static, consecutive numbers to capturing parentheses.
sequenceMatcher(matcher1: Matcher, parenCount1: Integer, matcher2: Matcher) : Matcher
= function(t: REInput, x: REMatch, p: Integer, c: Continuation)
let d: Continuation
= function(y: REMatch)
matcher2(t, y, p + parenCount1, c)
in matcher1(t, x, p, d)
sequenceMatcher returns a Matcher that matches the concatenation of the patterns matched by matcher1 and matcher2. parenCount1 is the number of capturing left parentheses inside matcher2's pattern.
characterSetMatcher(acceptanceSet: {Character}) : Matcher
= function(t: REInput, x: REMatch, p: Integer, c: Continuation)
let i: Integer = x.endIndex;
s: String = t.str
in if i = length(s)
then failure
else if s[i] acceptanceSet
then c(endIndex (i + 1), captured x.captured)
else failure
characterSetMatcher returns a Matcher that matches a single input string character. The match succeeds if the character is a member of the acceptanceSet set of characters (possibly ignoring case).
characterMatcher(ch: Character) : Matcher = characterSetMatcher({ch})
characterMatcher returns a Matcher that matches a single input string character. The match succeeds if the character is the same as ch (possibly ignoring case).
action Exec[RegularExpressionPattern] : REInput Integer REResult
Exec[RegularExpressionPattern Disjunction](t: REInput, index: Integer)
= Match[Disjunction](
t,
endIndex index, captured fillCapture(CountParens[Disjunction]),
0,
(function(x: REMatch)
success x))
fillCapture(i: Integer) : Capture[]
= if i = 0
then []Capture
else fillCapture(i - 1) [absent]
action Match[Disjunction] : Matcher
Match[Disjunction Alternativenormal] = Match[Alternativenormal]
Match[Disjunction Alternativenormal | Disjunction1]
(t: REInput, x: REMatch, p: Integer, c: Continuation)
= case Match[Alternativenormal](t, x, p, c) of
success(y: REMatch): success y;
failure: Match[Disjunction1](t, x, p + CountParens[Alternativenormal], c)
end
action CountParens[Disjunction] : Integer
CountParens[Disjunction Alternativenormal] = CountParens[Alternativenormal]
CountParens[Disjunction Alternativenormal | Disjunction1]
= CountParens[Alternativenormal] + CountParens[Disjunction1]
*+?type Limit = oneof {finite: Integer; infinite}
resetParens(x: REMatch, p: Integer, nParens: Integer) : REMatch
= if nParens = 0
then x
else let y: REMatch = endIndex x.endIndex, captured x.captured[p absent]
in resetParens(y, p + 1, nParens - 1)
repeatMatcher(body: Matcher, min: Integer, max: Limit, lazy: Boolean, nBodyParens: Integer)
: Matcher
= function(t: REInput, x: REMatch, p: Integer, c: Continuation)
if
(case max of
finite(m: Integer): m = 0;
infinite: false
end)
then c(x)
else let d: Continuation
= function(y: REMatch)
if min = 0 and (max is infinite and y.endIndex = x.endIndex)
then failure
else let newMin: Integer
= if min = 0
then 0
else min - 1;
newMax: Limit
= case max of
finite(m: Integer): finite (m - 1);
infinite: infinite
end
in repeatMatcher(body, newMin, newMax, lazy, nBodyParens)(
t,
y,
p,
c);
xr: REMatch = resetParens(x, p, nBodyParens)
in if min 0
then body(t, xr, p, d)
else if lazy
then case c(x) of
success(z: REMatch): success z;
failure: body(t, xr, p, d)
end
else case body(t, xr, p, d) of
success(z: REMatch): success z;
failure: c(x)
end
action Match[Alternativel] : Matcher
Match[Alternativel «empty»](t: REInput, x: REMatch, p: Integer, c: Continuation) = c(x)
Match[Alternativel Assertion Alternativenormal1]
(t: REInput, x: REMatch, p: Integer, c: Continuation)
= if TestAssertion[Assertion](t, x)
then Match[Alternativenormal1](t, x, p, c)
else failure
Match[Alternativel OrdinaryAtoml Alternativenormal1]
= sequenceMatcher(
Match[OrdinaryAtoml],
CountParens[OrdinaryAtoml],
Match[Alternativenormal1])
Match[Alternativel OneDigitEscape AlternativenonDecimalDigit1]
= sequenceMatcher(Match[OneDigitEscape], 0, Match[AlternativenonDecimalDigit1])
Match[Alternativel ShortOctalEscape AlternativenonOctalDigit1]
= sequenceMatcher(Match[ShortOctalEscape], 0, Match[AlternativenonOctalDigit1])
Match[Alternativel Atoml Quantifier Alternativenormal1]
= let min: Integer = Minimum[Quantifier];
max: Limit = Maximum[Quantifier];
lazy: Boolean = Lazy[Quantifier]
in if
(case max of
finite(m: Integer): m < min;
infinite: false
end)
then
else let looper: Matcher
= repeatMatcher(Match[Atoml], min, max, lazy, CountParens[Atoml])
in sequenceMatcher(looper, CountParens[Atoml], Match[Alternativenormal1])
action CountParens[Alternativel] : Integer
CountParens[Alternativel «empty»] = 0
CountParens[Alternativel Assertion Alternativenormal1]
= CountParens[Alternativenormal1]
CountParens[Alternativel OrdinaryAtoml Alternativenormal1]
= CountParens[OrdinaryAtoml] + CountParens[Alternativenormal1]
CountParens[Alternativel OneDigitEscape AlternativenonDecimalDigit1]
= CountParens[AlternativenonDecimalDigit1]
CountParens[Alternativel ShortOctalEscape AlternativenonOctalDigit1]
= CountParens[AlternativenonOctalDigit1]
CountParens[Alternativel Atoml Quantifier Alternativenormal1]
= CountParens[Atoml] + CountParens[Alternativenormal1]
action Minimum[Quantifier] : Integer
Minimum[Quantifier QuantifierPrefix] = Minimum[QuantifierPrefix]
Minimum[Quantifier QuantifierPrefix ?] = Minimum[QuantifierPrefix]
action Maximum[Quantifier] : Limit
Maximum[Quantifier QuantifierPrefix] = Maximum[QuantifierPrefix]
Maximum[Quantifier QuantifierPrefix ?] = Maximum[QuantifierPrefix]
action Lazy[Quantifier] : Boolean
Lazy[Quantifier QuantifierPrefix] = false
Lazy[Quantifier QuantifierPrefix ?] = true
action Minimum[QuantifierPrefix] : Integer
Minimum[QuantifierPrefix *] = 0
Minimum[QuantifierPrefix +] = 1
Minimum[QuantifierPrefix ?] = 0
Minimum[QuantifierPrefix { DecimalDigits }] = IntegerValue[DecimalDigits]
Minimum[QuantifierPrefix { DecimalDigits , }] = IntegerValue[DecimalDigits]
Minimum[QuantifierPrefix { DecimalDigits1 , DecimalDigits2 }]
= IntegerValue[DecimalDigits1]
action Maximum[QuantifierPrefix] : Limit
Maximum[QuantifierPrefix *] = infinite
Maximum[QuantifierPrefix +] = infinite
Maximum[QuantifierPrefix ?] = finite 1
Maximum[QuantifierPrefix { DecimalDigits }] = finite IntegerValue[DecimalDigits]
Maximum[QuantifierPrefix { DecimalDigits , }] = infinite
Maximum[QuantifierPrefix { DecimalDigits1 , DecimalDigits2 }]
= finite IntegerValue[DecimalDigits2]
action IntegerValue[DecimalDigits] : Integer
IntegerValue[DecimalDigits DecimalDigit] = DigitValue[DecimalDigit]
IntegerValue[DecimalDigits DecimalDigits1 DecimalDigit]
= 10*IntegerValue[DecimalDigits1] + DigitValue[DecimalDigit]
action DigitValue[DecimalDigit] : Integer = digitValue(DecimalDigit)
action TestAssertion[Assertion] : REInput REMatch Boolean
TestAssertion[Assertion ^](t: REInput, x: REMatch) = x.endIndex = 0
TestAssertion[Assertion $](t: REInput, x: REMatch) = x.endIndex = length(t.str)
TestAssertion[Assertion \ b](t: REInput, x: REMatch)
= atWordBoundary(x.endIndex, t.str)
TestAssertion[Assertion \ B](t: REInput, x: REMatch)
= not atWordBoundary(x.endIndex, t.str)
atWordBoundary(i: Integer, s: String) : Boolean
= i = 0 or (i = length(s) or (s[i - 1] reWordCharacters xor s[i] reWordCharacters))
.Match[Atoml OrdinaryAtoml] = Match[OrdinaryAtoml]
Match[Atoml OneDigitEscape] = Match[OneDigitEscape]
Match[Atoml ShortOctalEscape] = Match[ShortOctalEscape]
action CountParens[Atoml] : Integer
CountParens[Atoml OrdinaryAtoml] = CountParens[OrdinaryAtoml]
CountParens[Atoml OneDigitEscape] = 0
CountParens[Atoml ShortOctalEscape] = 0
action Match[OrdinaryAtoml] : Matcher
Match[OrdinaryAtoml CompoundAtom] = Match[CompoundAtom]
Match[OrdinaryAtoml PatternCharacterl] = characterMatcher(PatternCharacterl)
action CountParens[OrdinaryAtoml] : Integer
CountParens[OrdinaryAtoml CompoundAtom] = CountParens[CompoundAtom]
CountParens[OrdinaryAtoml PatternCharacterl] = 0
action Match[CompoundAtom] : Matcher
Match[CompoundAtom ( Disjunction )]
(t: REInput, x: REMatch, p: Integer, c: Continuation)
= let d: Continuation
= function(y: REMatch)
let updatedCaptured: Capture[]
= y.captured[p present t.str[x.endIndex ... y.endIndex - 1]]
in c(endIndex y.endIndex, captured updatedCaptured)
in Match[Disjunction](t, x, p + 1, d)
Match[CompoundAtom ( ? : Disjunction )] = Match[Disjunction]
Match[CompoundAtom .] = characterSetMatcher(nonTerminators)
Match[CompoundAtom CharacterClass](t: REInput, x: REMatch, p: Integer, c: Continuation)
= let a: {Character} = AcceptanceSet[CharacterClass](length(x.captured))
in characterSetMatcher(a)(t, x, p, c)
Match[CompoundAtom CharacterEscape] = characterMatcher(CharacterValue[CharacterEscape])
Match[CompoundAtom TwoDigitEscape] = Match[TwoDigitEscape]
Match[CompoundAtom CharacterClassEscape]
= characterSetMatcher(AcceptanceSet[CharacterClassEscape])
action CountParens[CompoundAtom] : Integer
CountParens[CompoundAtom ( Disjunction )] = CountParens[Disjunction] + 1
CountParens[CompoundAtom ( ? : Disjunction )] = CountParens[Disjunction]
CountParens[CompoundAtom .] = 0
CountParens[CompoundAtom CharacterClass] = 0
CountParens[CompoundAtom CharacterEscape] = 0
CountParens[CompoundAtom TwoDigitEscape] = 0
CountParens[CompoundAtom CharacterClassEscape] = 0
A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P | Q | R | S | T | U | V | W | X | Y | Za | b | c | d | e | f | g | h | i | j | k | l | m | n | o | p | q | r | s | t | u | v | w | x | y | zaction CharacterValue[CharacterEscape] : Character
CharacterValue[CharacterEscape ControlEscape] = CharacterValue[ControlEscape]
CharacterValue[CharacterEscape \ c ControlLetter]
= codeToCharacter(bitwiseAnd(characterToCode(ControlLetter), 31))
CharacterValue[CharacterEscape ThreeDigitEscape] = CharacterValue[ThreeDigitEscape]
CharacterValue[CharacterEscape HexEscape] = CharacterValue[HexEscape]
CharacterValue[CharacterEscape \ IdentityEscape] = IdentityEscape
action CharacterValue[ControlEscape] : Character
CharacterValue[ControlEscape \ f] = ‘«FF»’
CharacterValue[ControlEscape \ n] = ‘«LF»’
CharacterValue[ControlEscape \ r] = ‘«CR»’
CharacterValue[ControlEscape \ t] = ‘«TAB»’
CharacterValue[ControlEscape \ v] = ‘«VT»’
backreferenceMatcher(n: Integer) : Matcher
= function(t: REInput, x: REMatch, p: Integer, c: Continuation)
case nthBackreference(x, n) of
present(ref: String):
let i: Integer = x.endIndex;
s: String = t.str
in let j: Integer = i + length(ref)
in if j > length(s)
then failure
else if s[i ... j - 1] = ref
then c(endIndex j, captured x.captured)
else failure;
absent: failure
end
nthBackreference(x: REMatch, n: Integer) : Capture
= if n > 0 and n length(x.captured)
then x.captured[n - 1]
else
action Match[OneDigitEscape] : Matcher
Match[OneDigitEscape \ DecimalDigit]
= let n: Integer = DigitValue[DecimalDigit]
in if n = 0
then characterMatcher(‘«NUL»’)
else backreferenceMatcher(n)
action CharacterValue[ShortOctalEscape] : Integer Character
CharacterValue[ShortOctalEscape \ ZeroToThree OctalDigit](nCapturingParens: Integer)
= let n: Integer = 10*DigitValue[ZeroToThree] + DigitValue[OctalDigit]
in if n 10 and n nCapturingParens
then
else codeToCharacter(8*DigitValue[ZeroToThree] + DigitValue[OctalDigit])
action Match[ShortOctalEscape] : Matcher
Match[ShortOctalEscape \ ZeroToThree OctalDigit]
(t: REInput, x: REMatch, p: Integer, c: Continuation)
= let n: Integer = 10*DigitValue[ZeroToThree] + DigitValue[OctalDigit]
in if n 10 and n length(x.captured)
then backreferenceMatcher(n)(t, x, p, c)
else characterMatcher(
codeToCharacter(8*DigitValue[ZeroToThree] + DigitValue[OctalDigit]))(
t,
x,
p,
c)
action Match[TwoDigitEscape] : Matcher
Match[TwoDigitEscape \ ZeroToThree EightOrNine]
= backreferenceMatcher(10*DigitValue[ZeroToThree] + DigitValue[EightOrNine])
Match[TwoDigitEscape \ FourToNine DecimalDigit]
= backreferenceMatcher(10*DigitValue[FourToNine] + DigitValue[DecimalDigit])
action CharacterValue[ThreeDigitEscape] : Character
CharacterValue[ThreeDigitEscape \ ZeroToThree OctalDigit1 OctalDigit2]
= codeToCharacter(
64*DigitValue[ZeroToThree] + 8*DigitValue[OctalDigit1] + DigitValue[OctalDigit2])
action DigitValue[ZeroToThree] : Integer = digitValue(ZeroToThree)
action DigitValue[FourToNine] : Integer = digitValue(FourToNine)
action DigitValue[OctalDigit] : Integer = digitValue(OctalDigit)
action DigitValue[EightOrNine] : Integer = digitValue(EightOrNine)
action CharacterValue[HexEscape] : Character
CharacterValue[HexEscape \ x HexDigit1 HexDigit2]
= codeToCharacter(16*HexValue[HexDigit1] + HexValue[HexDigit2])
CharacterValue[HexEscape \ u HexDigit1 HexDigit2 HexDigit3 HexDigit4]
= codeToCharacter(
4096*HexValue[HexDigit1] + 256*HexValue[HexDigit2] + 16*HexValue[HexDigit3] +
HexValue[HexDigit4])
action DigitValue[HexDigit] : Integer = digitValue(HexDigit)
action AcceptanceSet[CharacterClassEscape] : {Character}
AcceptanceSet[CharacterClassEscape \ s] = reWhitespaces
AcceptanceSet[CharacterClassEscape \ S] = {‘«NUL»’ ... ‘«uFFFF»’} - reWhitespaces
AcceptanceSet[CharacterClassEscape \ d] = reDigits
AcceptanceSet[CharacterClassEscape \ D] = {‘«NUL»’ ... ‘«uFFFF»’} - reDigits
AcceptanceSet[CharacterClassEscape \ w] = reWordCharacters
AcceptanceSet[CharacterClassEscape \ W] = {‘«NUL»’ ... ‘«uFFFF»’} - reWordCharacters
action AcceptanceSet[CharacterClass] : Integer {Character}
AcceptanceSet[CharacterClass [ ClassRangesnoCaret ]](nCapturingParens: Integer)
= AcceptanceSet[ClassRangesnoCaret](nCapturingParens)
AcceptanceSet[CharacterClass [ ^ ClassRangesany ]](nCapturingParens: Integer)
= {‘«NUL»’ ... ‘«uFFFF»’} - AcceptanceSet[ClassRangesany](nCapturingParens)
action AcceptanceSet[ClassRangess] : Integer {Character}
AcceptanceSet[ClassRangess «empty»](nCapturingParens: Integer) = {}Character
AcceptanceSet[ClassRangess RangeLists,normal](nCapturingParens: Integer)
= AcceptanceSet[RangeLists,normal](nCapturingParens)
action AcceptanceSet[RangeLists,l] : Integer {Character}
AcceptanceSet[RangeLists,l FinalRangeAtoms,l](nCapturingParens: Integer)
= AcceptanceSet[FinalRangeAtoms,l](nCapturingParens)
AcceptanceSet[RangeLists,l OrdinaryRangeAtoms,l RangeListSuffixnormal]
(nCapturingParens: Integer)
= let a: {Character} = AcceptanceSet[OrdinaryRangeAtoms,l]
in AcceptanceSet[RangeListSuffixnormal](nCapturingParens, a)
AcceptanceSet[RangeLists,l ZeroEscape RangeListSuffixnonDecimalDigit]
(nCapturingParens: Integer)
= let a: {Character} = {CharacterValue[ZeroEscape]}
in AcceptanceSet[RangeListSuffixnonDecimalDigit](nCapturingParens, a)
AcceptanceSet[RangeLists,l ShortOctalEscape RangeListSuffixnonOctalDigit]
(nCapturingParens: Integer)
= let a: {Character} = {CharacterValue[ShortOctalEscape](nCapturingParens)}
in AcceptanceSet[RangeListSuffixnonOctalDigit](nCapturingParens, a)
action AcceptanceSet[RangeListSuffixl] : Integer {Character} {Character}
AcceptanceSet[RangeListSuffixl RangeListnoDash,l]
(nCapturingParens: Integer, low: {Character})
= low AcceptanceSet[RangeListnoDash,l](nCapturingParens)
AcceptanceSet[RangeListSuffixl - RangeAtomany,normal]
(nCapturingParens: Integer, low: {Character})
= characterRange(low, AcceptanceSet[RangeAtomany,normal](nCapturingParens))
AcceptanceSet[RangeListSuffixl - OrdinaryRangeAtomany,normal RangeListany,normal]
(nCapturingParens: Integer, low: {Character})
= let range: {Character}
= characterRange(low, AcceptanceSet[OrdinaryRangeAtomany,normal])
in range AcceptanceSet[RangeListany,normal](nCapturingParens)
AcceptanceSet[RangeListSuffixl - ZeroEscape RangeListany,nonDecimalDigit]
(nCapturingParens: Integer, low: {Character})
= let range: {Character} = characterRange(low, {CharacterValue[ZeroEscape]})
in range AcceptanceSet[RangeListany,nonDecimalDigit](nCapturingParens)
AcceptanceSet[RangeListSuffixl - ShortOctalEscape RangeListany,nonOctalDigit]
(nCapturingParens: Integer, low: {Character})
= let range: {Character}
= characterRange(low, {CharacterValue[ShortOctalEscape](nCapturingParens)})
in range AcceptanceSet[RangeListany,nonOctalDigit](nCapturingParens)
characterRange(low: {Character}, high: {Character}) : {Character}
= if (|low|) 1 or (|high|) 1
then
else let l: Character = min low;
h: Character = min high
in if l h
then {l ... h}
else
action AcceptanceSet[FinalRangeAtoms,l] : Integer {Character}
AcceptanceSet[FinalRangeAtomany,l RangeAtomany,l](nCapturingParens: Integer)
= AcceptanceSet[RangeAtomany,l](nCapturingParens)
AcceptanceSet[FinalRangeAtomnoCaret,l RangeAtomnoCaret,l](nCapturingParens: Integer)
= AcceptanceSet[RangeAtomnoCaret,l](nCapturingParens)
AcceptanceSet[FinalRangeAtomnoDash,l RangeAtomany,l](nCapturingParens: Integer)
= AcceptanceSet[RangeAtomany,l](nCapturingParens)
action AcceptanceSet[RangeAtoms,l] : Integer {Character}
AcceptanceSet[RangeAtoms,l OrdinaryRangeAtoms,l](nCapturingParens: Integer)
= AcceptanceSet[OrdinaryRangeAtoms,l]
AcceptanceSet[RangeAtoms,l ZeroEscape](nCapturingParens: Integer)
= {CharacterValue[ZeroEscape]}
AcceptanceSet[RangeAtoms,l ShortOctalEscape](nCapturingParens: Integer)
= {CharacterValue[ShortOctalEscape](nCapturingParens)}
action AcceptanceSet[OrdinaryRangeAtoms,l] : {Character}
AcceptanceSet[OrdinaryRangeAtoms,l RangeCharacters,l] = {RangeCharacters,l}
AcceptanceSet[OrdinaryRangeAtoms,l RangeEscape] = AcceptanceSet[RangeEscape]
action AcceptanceSet[RangeEscape] : {Character}
AcceptanceSet[RangeEscape \ b] = {‘«BS»’}
AcceptanceSet[RangeEscape CharacterEscape] = {CharacterValue[CharacterEscape]}
AcceptanceSet[RangeEscape CharacterClassEscape] = AcceptanceSet[CharacterClassEscape]
action CharacterValue[ZeroEscape] : Character
CharacterValue[ZeroEscape \ 0] = ‘«NUL»’