/* (c) https://github.com/MontiCore/monticore */
package de.monticore.regex;

/* This is a MontiCore stable grammar.
 * Adaptations -- if any -- are conservative. */

import de.monticore.literals.*;

/**
 * This grammar defines regular expressions over UTF characters, such as
 *     a*b+a?cc
 *     [A-F0-9]{4,6}
 *     Hello_(World|Tom)
 *
 * The syntax used for this grammar is modeled after the syntax used
 * in Java, see e.g.
 * https://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html
 *
 * The grammar conforms to the specification at
 * https://en.wikipedia.org/wiki/Regular_expression#Formal_language_theory
 *
 * A detailed explanation of regular expressions can be found at
 * https://en.wikipedia.org/wiki/Regular_expression
 *
 * The grammar can be used in isolated form, but usually is embedded
 * in DSLs where a regular expression is of use, e.g. when
 * describing allowed input patterns.
 *
 * This grammar only relies on MontiCore's literals.
 * Although nonterminals are called "RegularExpression", they are
 * defined independently of the expression grammars of MontiCore.
 *
 * The grammar uses a special lexer mode, namely REGEX.
 * This mode is needed, because the parsing of regular expressions
 * and especially their tokens significantly differs from the normal
 * forms of tokens.
 *
 * The nonterminal RegularExpression is designed in such a way that
 * it assumes that the mode REGEX has already been switched on
 * (and will be switched off) by the calling nonterminal.
 * This functionality is provided by the nonterminal RegExLiteral.
 *
 */

component grammar RegularExpressions extends MCCommonLiterals {

  /*=================================================================*/
  /*================== REGULAR EXPRESSION ===========================*/
  /*=================================================================*/

  /**
   * Represents a regular expression. "left" and "right" are only present
   * if the regular expression comprises two parts split by a pipe.
   * Otherwise, the regular expression is made up of (non-empty) items.
   *   Syntax e.g.: R1|R2 , R1R2R3
   *   Matches e.g.: alternatives and sequences
   */
  RegularExpression
    = left:RegularExpression Pipe right:RegularExpression
    | RegExItem+
    ;

  /**
   * A regular expression literal: a RegularExpression enclosed by
   * RegExStartToken and RegExEndToken. These two tokens switch the
   * lexer mode REGEX on and off around the regular expression.
   *   Syntax e.g.: R"a*b+"
   */
  RegExLiteral = RegExStartToken RegularExpression RegExEndToken;

  /**
   * Opens a regular expression literal (the character R directly
   * followed by a double quote) and pushes the lexer mode REGEX.
   */
  token RegExStartToken = 'R' '"' : -> pushMode(REGEX);

  /**
   * Closes a regular expression literal (a double quote seen in mode
   * REGEX) and pops the lexer mode again.
   */
  token RegExEndToken(REGEX) = '"' : -> popMode;

  /*=================================================================*/
  /*========================= ITEM ==================================*/
  /*=================================================================*/

  /**
   * Interface for every character or symbol which can occur inside of
   * a regular expression.
   */
  interface RegExItem;

  /*=================================================================*/
  /*=================== BRACKET ITEMS [...] =========================*/
  /*=================================================================*/

  /**
   * This represents an (inverted) bracket expression. A single character
   * contained in the brackets is matched when evaluating the regular
   * expression.
   *   Syntax e.g.: [R], [^R]
   *   Matches e.g.: a single character (not) contained in R
   */
  BracketRegEx implements RegExItem =
    LBrack Roof? BracketRegExItem* RBrack;

  /**
   * Interface for every character or symbol which can occur inside of a
   * bracket expression.
   */
  interface BracketRegExItem;

  /**
   * Represents a normal character used in a bracket expression
   */
  CharOption implements BracketRegExItem = CharacterInCharacterElement;

  /**
   * SpecialCharOption represents a single character
   * of the special characters, which are usable in a bracket expression.
   *
   * Remark:
   * This nonterminal is needed because the token
   * CharacterInCharacterElement does not match special characters used
   * in other places of this grammar, as they have a higher priority than
   * the CharacterInCharacterElement token in the lexing process.
   *
   * Note that RBrack is not among the alternatives.
   */
  SpecialCharOption implements BracketRegExItem =
    Point | Star | Plus | Comma | Roof | Pipe | Backslash |
    LCurly | RCurly | LBrack | Dollar | LParen | RParen | Question |
    SingleDigit;

  /**
   * Represents a range of characters or numbers.
   *
   * These are only characters and digits.
   */
  CharRange implements BracketRegExItem = Range;

  /*=================================================================*/
  /*=================== CAPTURING GROUPS ============================*/
  /*=================================================================*/

  /**
   * Bundles a part of a regular expression in parentheses to (optionally) use
   * it later in the same expression or to replace it with a new String.
   *   Syntax    : (R)
   *   Matches   : R and allows to reuse it later in the
   *               regex via \n (n being a single digit, see BackReference)
   */
  CapturingGroup implements RegExItem =
    LParen RegularExpression RParen;

  /*=================================================================*/

  /**
   * Bundles a part of a regular expression in parentheses to (optionally) use
   * it later in the same expression or to replace it with a new String.
   * Giving the capturing group a name fosters its reusability.
   *   Syntax    : (?<name>R)
   *   Matches   : R and allows to reuse it later via \k<name>
   */
  symbol NamedCapturingGroup implements RegExItem =
    NamedCapturingGroupStart Name NamedCapturingGroupEnd
    RegularExpression RParen;

  /**
   * Indicates the start of the name of a capturing group. Switches the lexer
   * mode to recognize a consecutive Name token.
   */
  token NamedCapturingGroupStart(REGEX) = "(?<" : -> pushMode(DEFAULT_MODE);

  /**
   * Indicates the end of the name of a capturing group. Switches the lexer
   * mode (by popping it) to recognize consecutive regular expression tokens.
   * The same token also closes the name in a BackReference.
   */
  token NamedCapturingGroupEnd = ">" : -> popMode;

  /*=================================================================*/

  /**
   * Bundles a part of a regular expression that does not need to be reused
   * anywhere else, i.e. this is a plain pair of parentheses.
   *   Syntax    : (?:R)
   *   Matches   : R
   */
  NonCapturingGroup implements RegExItem =
    NonCapturingGroupStart RegularExpression RParen;

  /**
   * Indicates the start of a non-capturing group.
   */
  token NonCapturingGroupStart(REGEX) = "(?:";

  /*=================================================================*/

  /**
   * Refers to a capturing group, either by a single digit or by the
   * name of a NamedCapturingGroup.
   *   Syntax e.g.: \0, \9, \k<name>
   *   Matches    : what has been defined in a group before
   */
  BackReference implements RegExItem =
    Backslash
    (SingleDigit |
     BackReferenceStart Name@NamedCapturingGroup NamedCapturingGroupEnd);

  /**
   * Indicates the start of the name of a referenced capturing group.
   * Switches the lexer mode to recognize a consecutive Name token.
   */
  token BackReferenceStart(REGEX) = "k<" : -> pushMode(DEFAULT_MODE);

  /*=================================================================*/
  /*================== ITEM (cont.) =================================*/
  /*=================================================================*/

  /**
   * Represents a character used in a regular expression.
   *   Syntax    : a, b, Z, etc.
   *   Matches   : a, b, Z, etc.
   */
  RegExChar implements RegExItem = CharacterInCharacterElement;

  /**
   * Represents a point in a regular expression which matches any character.
   *   Syntax    : .
   *   Matches   : a, §, 5, etc.
   */
  RegExPoint implements RegExItem = Point;

  /**
   * Represents a single digit used in a regular expression.
   *   Syntax    : 0, 1, 9, etc.
   *   Matches   : 0, 1, 9, etc.
   */
  RegExDigit implements RegExItem = SingleDigit;

  /**
   * Indicates that the consecutive regular expression should start
   * with a new line.
   *   Syntax    : ^R
   *   Matches   : newline|linefeed and then R
   */
  StartOfLine implements RegExItem = Roof RegularExpression;

  /**
   * This allows to qualify an item with an allowed quantity or
   * marks it as the end of a line.
   *   Syntax    : R?, R+, R*, R$
   *               R{3,5}, R{4}, R{3,}, R{,5}
   *   Matches   : R in the appropriate quantity,
   *               or R before the end of a line with R$
   */
  QualifiedRegEx implements RegExItem = RegExItem Qualification;

  /*=================================================================*/
  /*==================== QUALIFICATION ==============================*/
  /*=================================================================*/

  /**
   * Interface for the qualifications allowed in regular expressions.
   */
  interface Qualification;

  /**
   * All simple qualifications which are allowed after
   * a regular expression
   *   Syntax    : *, +, ?, $
   */
  RegExQualification implements Qualification =
    Star | Plus | Question | Dollar;

  /**
   * ASTRangeQualification can be used after an item in a regular
   * expression if the item should be matched a specified amount of times.
   * Both bounds are optional and consist of one or more digits each.
   *   Syntax e.g.: {3,5}, {3,}, {,5}
   */
  RangeQualification implements Qualification =
    LCurly (lowerBound:SingleDigit+)? Comma (upperBound:SingleDigit+)? RCurly;

  /**
   * ASTNumberQualification can be used after an item in a regular
   * expression if the item should be matched a specified amount of times.
   *   Syntax e.g.: {4}
   */
  NumberQualification implements Qualification =
    LCurly SingleDigit+ RCurly;

  /*=================================================================*/
  /*==================== ESCAPES FOR SINGLE CHARACTERS ==============*/
  /*=================================================================*/

  /**
   * ASTEscapeChar is an interface for all character descriptions using
   * an "\"-operator as escape.
   *   Syntax    : \p{ASCII}, \p{Cntrl}, \w, \W, \B, \S, ...
   *   Matches   : single characters
   */
  interface EscapeChar extends RegExItem;

  /*=================================================================*/

  /**
   * ASTSpecificChars allows to specify \p - escapes, matching a
   * single character within a named (and thus predefined) group:
   *   Syntax    : \p{ASCII}, \p{Cntrl}, ...
   *   Matches   : single characters
   *
   * The arguments usable for \p are:
   *   ASCII, Alnum, Alpha, Blank, Cntrl, Digit, Graph, Lower, Print,
   *   Punct, Space, Upper, XDigit
   */
  SpecificChars implements EscapeChar =
    SpecificCharsStart SpecificCharsName;

  /**
   * See SpecificChars nonterminal.
   * Matches the leading \p and pushes DEFAULT_MODE so that the
   * following SpecificCharsName (which contains a Name) can be lexed.
   */
  token SpecificCharsStart(REGEX) = "\\p" : -> pushMode(DEFAULT_MODE);

  /**
   * See SpecificChars nonterminal.
   * Matches a Name in curly brackets and pops the lexer mode again.
   */
  token SpecificCharsName = "{" Name "}" : -> popMode;

  /*=================================================================*/
  /*==================== SIMPLE MATCHERS ============================*/
  /*=================================================================*/

  /**
   * ASTRegExEscapeChar allows to specify all escapes with no arguments,
   *   Syntax    : \w, \W, \B, \S, ...
   *   Matches e.g.: single characters
   */
  RegExEscapeChar implements EscapeChar =
    AlphaNumCharsWithUnderscoreToken | NonWordCharsToken |
    WordBoundariesToken | NonWordBoundariesToken |
    DigitCharsToken | NonDigitCharsToken |
    WhitespaceCharsToken | NonWhitespaceCharsToken |
    Backslash;

  /**
   * Matches all alphanumeric characters as well as "_".
   *   Syntax    : \w
   *   Matches e.g.: a, z, A, Z, 0, 9, _
   */
  token AlphaNumCharsWithUnderscoreToken(REGEX) = "\\w";

  /**
   * Matches all non-alphanumeric characters except "_".
   *   Syntax    : \W
   *   Matches e.g.: %, (, $, Space, Linefeed, etc.
   */
  token NonWordCharsToken(REGEX) = "\\W";

  /**
   * Matches all word boundaries.
   * A word boundary is not a character but a
   * position between two characters.
   *   Syntax    : \b
   *   Matches e.g.: |Hello| |World|
   *                 ( where "|" indicates word boundary)
   */
  token WordBoundariesToken(REGEX) = "\\b";

  /**
   * Matches all non-word boundaries.
   * A non-word boundary is not a character but a
   * position between two characters.
   *   Syntax    : \B
   *   Matches e.g.: H|e|l|l|o W|o|r|l|d
   *                 ( where "|" indicates non word boundary)
   */
  token NonWordBoundariesToken(REGEX) = "\\B";

  /**
   * Matches all digits.
   *   Syntax    : \d
   *   Matches e.g.: 0, 4, 9
   */
  token DigitCharsToken(REGEX) = "\\d";

  /**
   * Matches everything except digits.
   *   Syntax    : \D
   *   Matches e.g.: _, a
   */
  token NonDigitCharsToken(REGEX) = "\\D";

  /**
   * Matches all whitespace characters.
   *   Syntax    : \s
   *   Matches e.g.: Space, Linefeed, Newline, Tab
   */
  token WhitespaceCharsToken(REGEX) = "\\s";

  /**
   * Matches everything except whitespace characters.
   *   Syntax    : \S
   *   Matches e.g.: 0, _, a
   */
  token NonWhitespaceCharsToken(REGEX) = "\\S";

  /*=================================================================*/
  /*=============== CORE TOKENS FOR REGEX CONTROL ===================*/
  /*=================================================================*/

  /**
   * SingleDigit denotes a single digit
   */
  token SingleDigit(REGEX) = Digit;

  /**
   * CharacterInCharacterElement describes any single character, except
   * the closing ')'.
   *   Syntax e.g.: g, 0, /
   *   Matches e.g.: g, 0, /    (i.e. itself)
   *
   * NOTE(review): the rule body is the wildcard ".", so the rule itself
   * does not exclude ')'; the exception stated above presumably results
   * from the priority of the other REGEX tokens -- please confirm.
   */
  token CharacterInCharacterElement(REGEX) = . ;

  /**
   * Range is used to parse a range of characters or numbers:
   * a letter or digit, a '-', and another letter or digit.
   *   Syntax e.g.: g-j, 0-9
   *   Matches e.g.: a, 0, F
   */
  token Range(REGEX) =
    ('a' .. 'z' | 'A' .. 'Z' | '0' .. '9')
    '-'
    ('a' .. 'z' | 'A' .. 'Z' | '0' .. '9');

  /**
   * The following tokens are defined in this grammar to make
   * them accessible in the "REGEX" grammar mode.
   *
   */
  token Pipe(REGEX) = "|";
  token LBrack(REGEX) = "[";
  token RBrack(REGEX) = "]";
  token Roof(REGEX) = "^";
  token RParen(REGEX) = ")";
  token LParen(REGEX) = "(";
  token Point(REGEX) = ".";
  token Backslash(REGEX) = "\\";
  token Star(REGEX) = "*";
  token Plus(REGEX) = "+";
  token Question(REGEX) = "?";
  token Comma(REGEX) = ",";
  token Dollar(REGEX) = "$";
  token RCurly(REGEX) = "}";
  token LCurly(REGEX) = "{";

}