Search in sources :

Example 1 with PatternSequence

use of org.matheclipse.core.expression.PatternSequence in project symja_android_library by axkr.

the class StringFunctions method toRegexString.

/**
 * Convert a Symja expression which represents a 'piece of a regular expression' to a Java regular
 * expression string.
 *
 * <p>
 * The conversion is recursive: composite pieces (repeated patterns, named patterns, alternatives,
 * <code>Shortest</code>/<code>Longest</code> wrappers) convert their sub-expressions with this
 * same method and wrap the result.
 *
 * @param partOfRegex the expression which represents a regex 'piece' which must be converted to a
 *        Java regex string
 * @param abbreviatedPatterns if <code>true</code> allow 'abbreviated patterns' in strings (i.e.
 *        '\','*' and '@' operators)
 * @param stringFunction the original string function, used in error messages
 * @param shortestLongest either {@link #REGEX_LONGEST} or {@link #REGEX_SHORTEST}; the array is
 *        indexed with <code>ASTERISK_Q</code> and <code>PLUS_Q</code> to obtain the quantifier
 *        which is appended to repeated pieces
 * @param groups output map; for every named capturing group which is emitted into the regex the
 *        pattern symbol is stored together with the group name (the symbol's string form)
 * @param engine the evaluation engine
 * @return the Java regex string or <code>null</code> if the expression (or one of its
 *         sub-expressions) could not be converted
 * @see <a href="https://en.wikipedia.org/wiki/Perl_Compatible_Regular_Expressions">Wikipedia -
 *      Perl Compatible Regular Expression</a>
 */
private static String toRegexString(IExpr partOfRegex, boolean abbreviatedPatterns, IAST stringFunction, String[] shortestLongest, Map<ISymbol, String> groups, EvalEngine engine) {
    if (partOfRegex.isString()) {
        final String str = partOfRegex.toString();
        if (abbreviatedPatterns) {
            // Scan the string once; literal runs between operators are quoted, the operators
            // themselves are replaced by regex fragments.
            StringBuilder pieces = new StringBuilder();
            int beginIndex = 0;
            int endIndex = 0;
            final int len = str.length();
            while (endIndex < len) {
                char c = str.charAt(endIndex);
                if (c == '\\' && endIndex + 1 < len) {
                    // backslash: take the following character literally
                    pieces.append(Pattern.quote(str.substring(beginIndex, endIndex)));
                    pieces.append(Pattern.quote(str.substring(endIndex + 1, endIndex + 2)));
                    endIndex += 2;
                    beginIndex = endIndex;
                } else if (c == '*') {
                    // zero or more arbitrary characters
                    pieces.append(Pattern.quote(str.substring(beginIndex, endIndex)));
                    pieces.append("(.*)");
                    endIndex += 1;
                    beginIndex = endIndex;
                } else if (c == '@') {
                    pieces.append(Pattern.quote(str.substring(beginIndex, endIndex)));
                    // one or more characters, excluding upper case letters
                    pieces.append("([^A-Z]+)");
                    endIndex += 1;
                    beginIndex = endIndex;
                } else {
                    endIndex += 1;
                }
            }
            // trailing literal run (possibly empty)
            pieces.append(Pattern.quote(str.substring(beginIndex, endIndex)));
            return pieces.toString();
        }
        return Pattern.quote(str);
    } else if (partOfRegex.isAST(S.Characters, 2) && partOfRegex.first().isString()) {
        // NOTE(review): the string is inserted into the character class unescaped, so characters
        // like ']', '^', '-' or '\' keep their regex meaning - confirm this is intended.
        String str = ((IStringX) partOfRegex.first()).toString();
        return "[" + str + "]";
    } else if (partOfRegex.isAST(S.RegularExpression, 2) && partOfRegex.first().isString()) {
        // the user supplied a raw regex; pass it through unchanged
        return ((IStringX) partOfRegex.first()).toString();
    } else if (partOfRegex instanceof RepeatedPattern) {
        RepeatedPattern repeated = (RepeatedPattern) partOfRegex;
        IExpr expr = repeated.getRepeatedExpr();
        if (expr == null) {
            return null;
        }
        if (expr.isAST(S.Pattern, 3) && expr.first().isSymbol()) {
            // named repeated pattern: the whole repetition becomes one named capturing group
            final ISymbol symbol = (ISymbol) expr.first();
            String str = toRegexString(expr.second(), abbreviatedPatterns, stringFunction, shortestLongest, groups, engine);
            if (str != null) {
                final String groupName = symbol.toString();
                groups.put(symbol, groupName);
                if (repeated.isNullSequence()) {
                    return "(?<" + groupName + ">(" + str + ")" + shortestLongest[ASTERISK_Q] + ")";
                } else {
                    return "(?<" + groupName + ">(" + str + ")" + shortestLongest[PLUS_Q] + ")";
                }
            }
        } else {
            String str = toRegexString(expr, abbreviatedPatterns, stringFunction, shortestLongest, groups, engine);
            if (str != null) {
                if (repeated.isNullSequence()) {
                    return "(" + str + ")" + shortestLongest[ASTERISK_Q];
                } else {
                    return "(" + str + ")" + shortestLongest[PLUS_Q];
                }
            }
        }
    } else if (partOfRegex.isAST(S.StringExpression)) {
        IAST stringExpression = (IAST) partOfRegex;
        return toRegexString(stringFunction, stringExpression, abbreviatedPatterns, shortestLongest, groups, engine);
    } else if (partOfRegex.isBlank()) {
        // a single arbitrary character, including a newline
        return "(.|\\n)";
    } else if (partOfRegex.isPattern()) {
        final IPattern pattern = (IPattern) partOfRegex;
        final ISymbol symbol = pattern.getSymbol();
        if (symbol != null && pattern.getHeadTest() == null) {
            // see github #221 - use Java regex - named capturing groups
            final String groupName = symbol.toString();
            groups.put(symbol, groupName);
            if (pattern instanceof PatternNested) {
                PatternNested pn = (PatternNested) pattern;
                IExpr subPattern = pn.getPatternExpr();
                String subPatternRegex = toRegexString(subPattern, abbreviatedPatterns, stringFunction, shortestLongest, groups, engine);
                if (subPatternRegex != null) {
                    return "(?<" + groupName + ">" + subPatternRegex + ")";
                }
                // The sub-pattern could not be converted. Do not embed the text "null" into the
                // regex; fall through to the 'unsupported' report below and return null.
            } else {
                return "(?<" + groupName + ">(.|\\n))";
            }
        }
    } else if (partOfRegex.isAST(S.Pattern, 3) && partOfRegex.first().isSymbol()) {
        final ISymbol symbol = (ISymbol) partOfRegex.first();
        String str = toRegexString(partOfRegex.second(), abbreviatedPatterns, stringFunction, shortestLongest, groups, engine);
        if (str != null) {
            final String groupName = symbol.toString();
            groups.put(symbol, groupName);
            return "(?<" + groupName + ">" + str + ")";
        }
    } else if (partOfRegex.isPatternSequence(false)) {
        PatternSequence ps = ((PatternSequence) partOfRegex);
        final ISymbol symbol = ps.getSymbol();
        final String str;
        if (ps.isNullSequence()) {
            // RepeatedNull
            str = "(.|\\n)" + shortestLongest[ASTERISK_Q];
        } else {
            // Repeated
            str = "(.|\\n)" + shortestLongest[PLUS_Q];
        }
        if (symbol == null) {
            return str;
        }
        final String groupName = symbol.toString();
        groups.put(symbol, groupName);
        return "(?<" + groupName + ">" + str + ")";
    } else if (partOfRegex.isAST(S.CharacterRange, 3)) {
        String[] characterRange = characterRange((IAST) partOfRegex);
        if (characterRange != null) {
            StringBuilder buf = new StringBuilder();
            buf.append("[");
            buf.append(Pattern.quote(characterRange[0]));
            buf.append("-");
            buf.append(Pattern.quote(characterRange[1]));
            buf.append("]");
            return buf.toString();
        }
    } else if (partOfRegex.isAlternatives()) {
        IAST alternatives = (IAST) partOfRegex;
        StringBuilder pieces = new StringBuilder();
        for (int i = 1; i < alternatives.size(); i++) {
            String str = toRegexString(alternatives.get(i), abbreviatedPatterns, stringFunction, shortestLongest, groups, engine);
            if (str == null) {
                // `1` currently not supported in `2`.
                IOFunctions.printMessage(stringFunction.topHead(), "unsupported", F.list(alternatives.get(i), stringFunction.topHead()), engine);
                return null;
            }
            pieces.append(str);
            if (i < alternatives.size() - 1) {
                pieces.append('|');
            }
        }
        return pieces.toString();
    } else if (partOfRegex.isAST(S.Shortest, 2)) {
        // convert the argument with the 'shortest' quantifier table
        return toRegexString(partOfRegex.first(), abbreviatedPatterns, stringFunction, REGEX_SHORTEST, groups, engine);
    } else if (partOfRegex.isAST(S.Longest, 2)) {
        // convert the argument with the 'longest' quantifier table
        return toRegexString(partOfRegex.first(), abbreviatedPatterns, stringFunction, REGEX_LONGEST, groups, engine);
    } else if (partOfRegex.isBuiltInSymbol()) {
        int ordinal = ((IBuiltInSymbol) partOfRegex).ordinal();
        switch(ordinal) {
            case ID.NumberString:
                // better suitable for StringSplit?
                return "[0-9]{1,13}(\\.[0-9]+)?";
            // return "[-|+]?(\\d+(\\.\\d*)?|\\.\\d+)?";
            case ID.Whitespace:
                return "(?u)\\s+";
            case ID.DigitCharacter:
                return "\\d";
            case ID.WhitespaceCharacter:
                return "(?u)\\s";
            case ID.WordCharacter:
                return "(?u)[^\\W_]";
            case ID.StartOfLine:
                // NOTE(review): "\R" is Java's linebreak matcher and consumes the line
                // terminator; it does not match at the start of the input - confirm intended.
                return "\\R";
            case ID.EndOfLine:
                return "$";
            case ID.StartOfString:
                return "\\A";
            case ID.EndOfString:
                return "\\Z";
            case ID.WordBoundary:
                return "\\b";
            case ID.LetterCharacter:
                return "(?u)[^\\W_0-9]";
            case ID.HexidecimalCharacter:
                return "[0-9a-fA-F]";
            default:
                // `1` currently not supported in `2`.
                IOFunctions.printMessage(stringFunction.topHead(), "unsupported", F.list(partOfRegex, stringFunction.topHead()), engine);
                return null;
        }
    }
    // Reached for every piece which no branch above could convert.
    // `1` currently not supported in `2`.
    IOFunctions.printMessage(stringFunction.topHead(), "unsupported", F.list(partOfRegex, stringFunction.topHead()), engine);
    return null;
}
Also used : IPattern(org.matheclipse.core.interfaces.IPattern) RepeatedPattern(org.matheclipse.core.expression.RepeatedPattern) ISymbol(org.matheclipse.core.interfaces.ISymbol) PatternSequence(org.matheclipse.core.expression.PatternSequence) IStringX(org.matheclipse.core.interfaces.IStringX) IExpr(org.matheclipse.core.interfaces.IExpr) IAST(org.matheclipse.core.interfaces.IAST) PatternNested(org.matheclipse.core.expression.PatternNested)

Aggregations

PatternNested (org.matheclipse.core.expression.PatternNested)1 PatternSequence (org.matheclipse.core.expression.PatternSequence)1 RepeatedPattern (org.matheclipse.core.expression.RepeatedPattern)1 IAST (org.matheclipse.core.interfaces.IAST)1 IExpr (org.matheclipse.core.interfaces.IExpr)1 IPattern (org.matheclipse.core.interfaces.IPattern)1 IStringX (org.matheclipse.core.interfaces.IStringX)1 ISymbol (org.matheclipse.core.interfaces.ISymbol)1