Use of org.matheclipse.core.expression.RepeatedPattern in the project symja_android_library by axkr.
Class StringFunctions, method toRegexString.
/**
 * Convert a Symja expression which represents a 'piece of a regular expression' to a Java regular
 * expression string.
 *
 * <p>Handled pieces are: strings (optionally with abbreviated-pattern operators),
 * <code>Characters</code>, <code>RegularExpression</code>, repeated patterns,
 * <code>StringExpression</code>, blanks, (named) patterns, pattern sequences,
 * <code>CharacterRange</code>, <code>Alternatives</code>, <code>Shortest</code>,
 * <code>Longest</code> and a fixed set of built-in symbols such as <code>DigitCharacter</code>.
 *
 * @param partOfRegex the expression which represents a regex 'piece' which must be converted to a
 *        Java regex string
 * @param abbreviatedPatterns if <code>true</code> allow 'abbreviated patterns' in strings (i.e.
 *        the '\', '*' and '@' operators)
 * @param stringFunction the original string function, used in error messages
 * @param shortestLongest either {@link #REGEX_LONGEST} or {@link #REGEX_SHORTEST}; indexed with
 *        <code>ASTERISK_Q</code> and <code>PLUS_Q</code> to obtain the quantifier to append
 * @param groups receives a <code>symbol -&gt; group name</code> entry for every named capturing
 *        group which is emitted for a named pattern
 * @param engine the evaluation engine
 * @return the Java regex string or <code>null</code> if <code>partOfRegex</code> (or one of its
 *         sub-expressions) could not be converted; in the unsupported cases an "unsupported"
 *         message is printed for <code>stringFunction</code>
 * @see <a href="https://en.wikipedia.org/wiki/Perl_Compatible_Regular_Expressions">Wikipedia -
 *      Perl Compatible Regular Expression</a>
 */
private static String toRegexString(IExpr partOfRegex, boolean abbreviatedPatterns, IAST stringFunction, String[] shortestLongest, Map<ISymbol, String> groups, EvalEngine engine) {
  if (partOfRegex.isString()) {
    final String str = partOfRegex.toString();
    if (!abbreviatedPatterns) {
      return Pattern.quote(str);
    }
    // Split the string at the abbreviated-pattern operators; everything between two operators is
    // quoted literally.
    StringBuilder pieces = new StringBuilder();
    int beginIndex = 0;
    int endIndex = 0;
    final int len = str.length();
    while (endIndex < len) {
      char c = str.charAt(endIndex);
      if (c == '\\' && endIndex + 1 < len) {
        // backslash escapes the next character: take it literally
        pieces.append(Pattern.quote(str.substring(beginIndex, endIndex)));
        pieces.append(Pattern.quote(str.substring(endIndex + 1, endIndex + 2)));
        endIndex += 2;
        beginIndex = endIndex;
      } else if (c == '*') {
        // zero or more arbitrary characters
        pieces.append(Pattern.quote(str.substring(beginIndex, endIndex)));
        pieces.append("(.*)");
        endIndex += 1;
        beginIndex = endIndex;
      } else if (c == '@') {
        pieces.append(Pattern.quote(str.substring(beginIndex, endIndex)));
        // one or more characters, excluding upper case letters
        pieces.append("([^A-Z]+)");
        endIndex += 1;
        beginIndex = endIndex;
      } else {
        // ordinary character (including a trailing single backslash)
        endIndex += 1;
      }
    }
    pieces.append(Pattern.quote(str.substring(beginIndex, endIndex)));
    return pieces.toString();
  } else if (partOfRegex.isAST(S.Characters, 2) && partOfRegex.first().isString()) {
    String str = ((IStringX) partOfRegex.first()).toString();
    // Quote the characters so that ']', '^', '-' and '\' are taken literally inside the character
    // class (same approach as in the CharacterRange branch below).
    return "[" + Pattern.quote(str) + "]";
  } else if (partOfRegex.isAST(S.RegularExpression, 2) && partOfRegex.first().isString()) {
    // the user supplied regex is passed through unchanged
    return ((IStringX) partOfRegex.first()).toString();
  } else if (partOfRegex instanceof RepeatedPattern) {
    RepeatedPattern repeated = (RepeatedPattern) partOfRegex;
    IExpr expr = repeated.getRepeatedExpr();
    if (expr == null) {
      return null;
    }
    final String quantifier =
        repeated.isNullSequence() ? shortestLongest[ASTERISK_Q] : shortestLongest[PLUS_Q];
    if (expr.isAST(S.Pattern, 3) && expr.first().isSymbol()) {
      final ISymbol symbol = (ISymbol) expr.first();
      String str = toRegexString(expr.second(), abbreviatedPatterns, stringFunction, shortestLongest, groups, engine);
      if (str != null) {
        final String groupName = symbol.toString();
        groups.put(symbol, groupName);
        return "(?<" + groupName + ">(" + str + ")" + quantifier + ")";
      }
    } else {
      String str = toRegexString(expr, abbreviatedPatterns, stringFunction, shortestLongest, groups, engine);
      if (str != null) {
        return "(" + str + ")" + quantifier;
      }
    }
  } else if (partOfRegex.isAST(S.StringExpression)) {
    IAST stringExpression = (IAST) partOfRegex;
    return toRegexString(stringFunction, stringExpression, abbreviatedPatterns, shortestLongest, groups, engine);
  } else if (partOfRegex.isBlank()) {
    // any single character including newline
    return "(.|\\n)";
  } else if (partOfRegex.isPattern()) {
    final IPattern pattern = (IPattern) partOfRegex;
    final ISymbol symbol = pattern.getSymbol();
    if (symbol != null && pattern.getHeadTest() == null) {
      // see github #221 - use Java regex - named capturing groups
      final String groupName = symbol.toString();
      groups.put(symbol, groupName);
      if (pattern instanceof PatternNested) {
        PatternNested pn = (PatternNested) pattern;
        IExpr subPattern = pn.getPatternExpr();
        String subPatternRegex = toRegexString(subPattern, abbreviatedPatterns, stringFunction, shortestLongest, groups, engine);
        if (subPatternRegex == null) {
          // the sub-pattern is not convertible; don't build a regex containing the text "null"
          return null;
        }
        return "(?<" + groupName + ">" + subPatternRegex + ")";
      }
      return "(?<" + groupName + ">(.|\\n))";
    }
  } else if (partOfRegex.isAST(S.Pattern, 3) && partOfRegex.first().isSymbol()) {
    final ISymbol symbol = (ISymbol) partOfRegex.first();
    String str = toRegexString(partOfRegex.second(), abbreviatedPatterns, stringFunction, shortestLongest, groups, engine);
    if (str != null) {
      final String groupName = symbol.toString();
      groups.put(symbol, groupName);
      return "(?<" + groupName + ">" + str + ")";
    }
  } else if (partOfRegex.isPatternSequence(false)) {
    PatternSequence ps = ((PatternSequence) partOfRegex);
    final ISymbol symbol = ps.getSymbol();
    final String str;
    if (ps.isNullSequence()) {
      // RepeatedNull
      str = "(.|\\n)" + shortestLongest[ASTERISK_Q];
    } else {
      // Repeated
      str = "(.|\\n)" + shortestLongest[PLUS_Q];
    }
    if (symbol == null) {
      return str;
    }
    final String groupName = symbol.toString();
    groups.put(symbol, groupName);
    return "(?<" + groupName + ">" + str + ")";
  } else if (partOfRegex.isAST(S.CharacterRange, 3)) {
    String[] characterRange = characterRange((IAST) partOfRegex);
    if (characterRange != null) {
      StringBuilder buf = new StringBuilder();
      buf.append("[");
      buf.append(Pattern.quote(characterRange[0]));
      buf.append("-");
      buf.append(Pattern.quote(characterRange[1]));
      buf.append("]");
      return buf.toString();
    }
  } else if (partOfRegex.isAlternatives()) {
    IAST alternatives = (IAST) partOfRegex;
    // '|' has the lowest precedence in a regex. Wrap the alternatives in a non-capturing group so
    // that they stay together when this piece is concatenated with neighbouring pieces; a
    // non-capturing group doesn't change the numbering of the capturing groups.
    StringBuilder pieces = new StringBuilder("(?:");
    for (int i = 1; i < alternatives.size(); i++) {
      String str = toRegexString(alternatives.get(i), abbreviatedPatterns, stringFunction, shortestLongest, groups, engine);
      if (str == null) {
        // `1` currently not supported in `2`.
        IOFunctions.printMessage(stringFunction.topHead(), "unsupported", F.list(alternatives.get(i), stringFunction.topHead()), engine);
        return null;
      }
      if (i > 1) {
        pieces.append('|');
      }
      pieces.append(str);
    }
    pieces.append(')');
    return pieces.toString();
  } else if (partOfRegex.isAST(S.Shortest, 2)) {
    // convert the argument with the REGEX_SHORTEST quantifiers
    return toRegexString(partOfRegex.first(), abbreviatedPatterns, stringFunction, REGEX_SHORTEST, groups, engine);
  } else if (partOfRegex.isAST(S.Longest, 2)) {
    // convert the argument with the REGEX_LONGEST quantifiers
    return toRegexString(partOfRegex.first(), abbreviatedPatterns, stringFunction, REGEX_LONGEST, groups, engine);
  } else if (partOfRegex.isBuiltInSymbol()) {
    int ordinal = ((IBuiltInSymbol) partOfRegex).ordinal();
    switch (ordinal) {
      case ID.NumberString:
        // better suitable for StringSplit?
        return "[0-9]{1,13}(\\.[0-9]+)?";
      case ID.Whitespace:
        return "(?u)\\s+";
      case ID.DigitCharacter:
        return "\\d";
      case ID.WhitespaceCharacter:
        return "(?u)\\s";
      case ID.WordCharacter:
        return "(?u)[^\\W_]";
      case ID.StartOfLine:
        // NOTE(review): \R matches (and consumes) a line-break sequence and is not a zero-width
        // "start of line" assertion - confirm that this is intended.
        return "\\R";
      case ID.EndOfLine:
        return "$";
      case ID.StartOfString:
        return "\\A";
      case ID.EndOfString:
        // NOTE(review): \Z also matches before a final line terminator; \z would be the absolute
        // end of input - confirm which one is intended.
        return "\\Z";
      case ID.WordBoundary:
        return "\\b";
      case ID.LetterCharacter:
        return "(?u)[^\\W_0-9]";
      case ID.HexidecimalCharacter:
        return "[0-9a-fA-F]";
      default:
        // fall through to the common "unsupported" message below
        break;
    }
  }
  // `1` currently not supported in `2`.
  IOFunctions.printMessage(stringFunction.topHead(), "unsupported", F.list(partOfRegex, stringFunction.topHead()), engine);
  return null;
}
Aggregations