Search in sources:

Example 16 with CharacterRunAutomaton

Use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project elasticsearch by elastic.

From the class XContentMapValues, method filter:

/**
 * Creates a function that filters a document map according to the given include and exclude rules.
 * @param includes patterns for fields to keep; {@code null} or empty means keep everything
 * @param excludes patterns for fields to drop; {@code null} or empty means drop nothing
 * @see #filter(Map, String[], String[]) for details
 */
public static Function<Map<String, ?>, Map<String, Object>> filter(String[] includes, String[] excludes) {
    CharacterRunAutomaton matchAll = new CharacterRunAutomaton(Automata.makeAnyString());

    // No include rules: every field is included by default.
    final CharacterRunAutomaton includeAutomaton;
    if (includes != null && includes.length > 0) {
        includeAutomaton = new CharacterRunAutomaton(
                makeMatchDotsInFieldNames(Regex.simpleMatchToAutomaton(includes)));
    } else {
        includeAutomaton = matchAll;
    }

    // No exclude rules: the empty automaton matches nothing, so nothing is excluded.
    Automaton excludeSource = (excludes != null && excludes.length > 0)
            ? makeMatchDotsInFieldNames(Regex.simpleMatchToAutomaton(excludes))
            : Automata.makeEmpty();
    CharacterRunAutomaton excludeAutomaton = new CharacterRunAutomaton(excludeSource);

    return (map) -> filter(map, includeAutomaton, 0, excludeAutomaton, 0, matchAll);
}
Also used : Arrays(java.util.Arrays) Numbers(org.elasticsearch.common.Numbers) Automaton(org.apache.lucene.util.automaton.Automaton) Booleans(org.elasticsearch.common.Booleans) HashMap(java.util.HashMap) Function(java.util.function.Function) Strings(org.elasticsearch.common.Strings) ArrayList(java.util.ArrayList) List(java.util.List) Operations(org.apache.lucene.util.automaton.Operations) TimeValue(org.elasticsearch.common.unit.TimeValue) Map(java.util.Map) Regex(org.elasticsearch.common.regex.Regex) ElasticsearchParseException(org.elasticsearch.ElasticsearchParseException) Automata(org.apache.lucene.util.automaton.Automata) CharacterRunAutomaton(org.apache.lucene.util.automaton.CharacterRunAutomaton) Automaton(org.apache.lucene.util.automaton.Automaton) CharacterRunAutomaton(org.apache.lucene.util.automaton.CharacterRunAutomaton) CharacterRunAutomaton(org.apache.lucene.util.automaton.CharacterRunAutomaton)

Example 17 with CharacterRunAutomaton

Use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache.

From the class TestSpanFirstQuery, method testStartPositions:

/** Verifies SpanFirstQuery start-position matching when stopwords are removed by the analyzer. */
public void testStartPositions() throws Exception {
    Directory directory = newDirectory();
    // mimic StopAnalyzer: drop "the", "a", "of"
    CharacterRunAutomaton stopWords = new CharacterRunAutomaton(new RegExp("the|a|of").toAutomaton());
    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords);
    RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory, analyzer);

    // one doc starting with a stopword, one without
    for (String text : new String[] { "the quick brown fox", "quick brown fox" }) {
        Document document = new Document();
        document.add(newTextField("field", text, Field.Store.NO));
        indexWriter.addDocument(document);
    }

    IndexReader indexReader = indexWriter.getReader();
    IndexSearcher indexSearcher = newSearcher(indexReader);

    // user queries on "starts-with quick"
    SpanQuery startsWithQuick = spanFirstQuery(spanTermQuery("field", "quick"), 1);
    assertEquals(1, indexSearcher.search(startsWithQuick, 10).totalHits);

    // user queries on "starts-with the quick"
    SpanQuery withinFirstTwo = spanFirstQuery(spanTermQuery("field", "quick"), 2);
    SpanQuery theQuickOnly = spanNotQuery(withinFirstTwo, startsWithQuick);
    assertEquals(1, indexSearcher.search(theQuickOnly, 10).totalHits);

    indexWriter.close();
    indexReader.close();
    directory.close();
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) RegExp(org.apache.lucene.util.automaton.RegExp) CharacterRunAutomaton(org.apache.lucene.util.automaton.CharacterRunAutomaton) IndexReader(org.apache.lucene.index.IndexReader) Analyzer(org.apache.lucene.analysis.Analyzer) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) Document(org.apache.lucene.document.Document) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) Directory(org.apache.lucene.store.Directory)

Example 18 with CharacterRunAutomaton

Use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache.

From the class Dictionary, method parseAffix:

/**
   * Parses a specific affix rule putting the result into the provided affix map
   * 
   * @param affixes Map where the result of the parsing will be put
   * @param header Header line of the affix rule
   * @param reader BufferedReader to read the content of the rule from
   * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex
   *                         pattern
   * @param seenPatterns map from condition -&gt; index of patterns, for deduplication.
   * @param seenStrips map from strip string -&gt; ordinal, for deduplication.
   * @throws IOException Can be thrown while reading the rule
   * @throws ParseException if a rule line has fewer than four whitespace-separated fields
   */
private void parseAffix(TreeMap<String, List<Integer>> affixes, String header, LineNumberReader reader, String conditionPattern, Map<String, Integer> seenPatterns, Map<String, Integer> seenStrips) throws IOException, ParseException {
    BytesRefBuilder scratch = new BytesRefBuilder();
    StringBuilder sb = new StringBuilder();
    // header format: TYPE flag crossProduct numLines
    String[] args = header.split("\\s+");
    boolean crossProduct = args[2].equals("Y");
    // NOTE(review): identity comparison (==) on the pattern String appears intentional —
    // callers are expected to pass the shared SUFFIX_CONDITION_REGEX_PATTERN constant; confirm.
    boolean isSuffix = conditionPattern == SUFFIX_CONDITION_REGEX_PATTERN;
    int numLines = Integer.parseInt(args[3]);
    // each affix entry is serialized as 4 shorts = 8 bytes (hence the << 3):
    // flag, stripOrd, (patternOrd << 1 | crossProduct), appendFlagsOrd — see the writeShort calls below
    affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3));
    ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3);
    for (int i = 0; i < numLines; i++) {
        assert affixWriter.getPosition() == currentAffix << 3;
        String line = reader.readLine();
        // rule format: TYPE flag strip affix [condition [morph]]
        String[] ruleArgs = line.split("\\s+");
        // condition is optional
        if (ruleArgs.length < 4) {
            throw new ParseException("The affix file contains a rule with less than four elements: " + line, reader.getLineNumber());
        }
        char flag = flagParsingStrategy.parseFlag(ruleArgs[1]);
        // "0" means no characters are stripped
        String strip = ruleArgs[2].equals("0") ? "" : ruleArgs[2];
        String affixArg = ruleArgs[3];
        char[] appendFlags = null;
        // first: parse continuation classes out of affix (the part after '/')
        int flagSep = affixArg.lastIndexOf('/');
        if (flagSep != -1) {
            String flagPart = affixArg.substring(flagSep + 1);
            affixArg = affixArg.substring(0, flagSep);
            // when AF aliases are in use, the flag part is a numeric alias index
            if (aliasCount > 0) {
                flagPart = getAliasValue(Integer.parseInt(flagPart));
            }
            appendFlags = flagParsingStrategy.parseFlags(flagPart);
            Arrays.sort(appendFlags);
            twoStageAffix = true;
        }
        // zero affix -> empty string
        if ("0".equals(affixArg)) {
            affixArg = "";
        }
        String condition = ruleArgs.length > 4 ? ruleArgs[4] : ".";
        // repair an unclosed character class;
        // at least the gascon affix file has this issue
        if (condition.startsWith("[") && condition.indexOf(']') == -1) {
            condition = condition + "]";
        }
        // "dash hasn't got special meaning" (we must escape it)
        if (condition.indexOf('-') >= 0) {
            condition = escapeDash(condition);
        }
        final String regex;
        if (".".equals(condition)) {
            // Zero condition is indicated by dot
            regex = ".*";
        } else if (condition.equals(strip)) {
            // condition is exactly the stripped text, so it always holds
            // TODO: optimize this better:
            regex = ".*";
        // if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
        // but this is complicated...
        } else {
            regex = String.format(Locale.ROOT, conditionPattern, condition);
        }
        // deduplicate patterns: identical condition regexes share one automaton
        Integer patternIndex = seenPatterns.get(regex);
        if (patternIndex == null) {
            patternIndex = patterns.size();
            // pattern ordinal must fit in a short (it is written via writeShort below)
            if (patternIndex > Short.MAX_VALUE) {
                throw new UnsupportedOperationException("Too many patterns, please report this to dev@lucene.apache.org");
            }
            seenPatterns.put(regex, patternIndex);
            CharacterRunAutomaton pattern = new CharacterRunAutomaton(new RegExp(regex, RegExp.NONE).toAutomaton());
            patterns.add(pattern);
        }
        // deduplicate strip strings the same way
        Integer stripOrd = seenStrips.get(strip);
        if (stripOrd == null) {
            stripOrd = seenStrips.size();
            seenStrips.put(strip, stripOrd);
            if (stripOrd > Character.MAX_VALUE) {
                throw new UnsupportedOperationException("Too many unique strips, please report this to dev@lucene.apache.org");
            }
        }
        if (appendFlags == null) {
            appendFlags = NOFLAGS;
        }
        encodeFlags(scratch, appendFlags);
        int appendFlagsOrd = flagLookup.add(scratch.get());
        if (appendFlagsOrd < 0) {
            // already exists in our hash; BytesRefHash encodes "present" as -(ord)-1
            appendFlagsOrd = (-appendFlagsOrd) - 1;
        } else if (appendFlagsOrd > Short.MAX_VALUE) {
            // this limit is probably flexible, but it's a good sanity check too
            throw new UnsupportedOperationException("Too many unique append flags, please report this to dev@lucene.apache.org");
        }
        // serialize the affix entry: exactly 4 shorts, matching the grow/assert arithmetic above
        affixWriter.writeShort((short) flag);
        affixWriter.writeShort((short) stripOrd.intValue());
        // encode crossProduct into patternIndex
        int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0);
        affixWriter.writeShort((short) patternOrd);
        affixWriter.writeShort((short) appendFlagsOrd);
        if (needsInputCleaning) {
            CharSequence cleaned = cleanInput(affixArg, sb);
            affixArg = cleaned.toString();
        }
        // suffixes are indexed reversed so lookup can walk the word from its end
        if (isSuffix) {
            affixArg = new StringBuilder(affixArg).reverse().toString();
        }
        List<Integer> list = affixes.get(affixArg);
        if (list == null) {
            list = new ArrayList<>();
            affixes.put(affixArg, list);
        }
        list.add(currentAffix);
        currentAffix++;
    }
}
Also used : BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) RegExp(org.apache.lucene.util.automaton.RegExp) CharacterRunAutomaton(org.apache.lucene.util.automaton.CharacterRunAutomaton) ByteArrayDataOutput(org.apache.lucene.store.ByteArrayDataOutput) ParseException(java.text.ParseException)

Example 19 with CharacterRunAutomaton

Use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache.

From the class MinHashFilterTest, method createMockShingleTokenizer:

/**
 * Builds a MockTokenizer whose token pattern matches exactly {@code shingleSize}
 * whitespace-separated words, optionally primed with the given input text.
 */
private static Tokenizer createMockShingleTokenizer(int shingleSize, String shingles) {
    // one word followed by (shingleSize - 1) additional whitespace-separated words
    String shinglePattern = "[^ \t\r\n]+([ \t\r\n]+[^ \t\r\n]+){" + (shingleSize - 1) + "}";
    MockTokenizer tokenizer =
            new MockTokenizer(new CharacterRunAutomaton(new RegExp(shinglePattern).toAutomaton()), true);
    tokenizer.setEnableChecks(true);
    if (shingles != null) {
        tokenizer.setReader(new StringReader(shingles));
    }
    return tokenizer;
}
Also used : MockTokenizer(org.apache.lucene.analysis.MockTokenizer) RegExp(org.apache.lucene.util.automaton.RegExp) CharacterRunAutomaton(org.apache.lucene.util.automaton.CharacterRunAutomaton) StringReader(java.io.StringReader)

Example 20 with CharacterRunAutomaton

Use of org.apache.lucene.util.automaton.CharacterRunAutomaton in project lucene-solr by apache.

From the class MemoryIndexOffsetStrategy, method buildCombinedAutomaton:

/**
   * Builds one {@link CharacterRunAutomaton} matching any term the query might match.
   */
private static CharacterRunAutomaton buildCombinedAutomaton(Predicate<String> fieldMatcher, BytesRef[] terms, CharacterRunAutomaton[] automata, PhraseHelper strictPhrases, Function<Query, Collection<Query>> multiTermQueryRewrite) {
    final List<CharacterRunAutomaton> combined = new ArrayList<>();
    if (terms.length > 0) {
        combined.add(new CharacterRunAutomaton(Automata.makeStringUnion(Arrays.asList(terms))));
    }
    Collections.addAll(combined, automata);
    for (SpanQuery spanQuery : strictPhrases.getSpanQueries()) {
        // true == also look inside the span query
        Collections.addAll(combined,
                MultiTermHighlighting.extractAutomata(spanQuery, fieldMatcher, true, multiTermQueryRewrite));
    }
    // a single automaton needs no aggregation wrapper
    if (combined.size() == 1) {
        return combined.get(0);
    }
    // Aggregate automaton: matches when any of the collected automata matches.
    // The makeEmpty() constructor argument is bogus — run() below never consults it.
    return new CharacterRunAutomaton(Automata.makeEmpty()) {

        @Override
        public boolean run(char[] chars, int offset, int length) {
            // indexed loop instead of foreach to avoid Iterator allocation
            for (int i = 0, n = combined.size(); i < n; i++) {
                if (combined.get(i).run(chars, offset, length)) {
                    return true;
                }
            }
            return false;
        }
    };
}
Also used : CharacterRunAutomaton(org.apache.lucene.util.automaton.CharacterRunAutomaton) ArrayList(java.util.ArrayList) SpanQuery(org.apache.lucene.search.spans.SpanQuery)

Aggregations

CharacterRunAutomaton (org.apache.lucene.util.automaton.CharacterRunAutomaton)36 RegExp (org.apache.lucene.util.automaton.RegExp)15 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)10 ArrayList (java.util.ArrayList)7 Term (org.apache.lucene.index.Term)7 PhraseQuery (org.apache.lucene.search.PhraseQuery)6 Query (org.apache.lucene.search.Query)6 TermQuery (org.apache.lucene.search.TermQuery)6 List (java.util.List)5 Analyzer (org.apache.lucene.analysis.Analyzer)5 BooleanQuery (org.apache.lucene.search.BooleanQuery)5 BoostQuery (org.apache.lucene.search.BoostQuery)5 Automaton (org.apache.lucene.util.automaton.Automaton)5 HashMap (java.util.HashMap)4 Map (java.util.Map)4 Document (org.apache.lucene.document.Document)4 FuzzyQuery (org.apache.lucene.search.FuzzyQuery)4 PrefixQuery (org.apache.lucene.search.PrefixQuery)4 TermRangeQuery (org.apache.lucene.search.TermRangeQuery)4 MockTokenizer (org.apache.lucene.analysis.MockTokenizer)3