
Example 16 with RegExp

Use of org.apache.lucene.util.automaton.RegExp in the apache/lucene-solr project.

From class TestTermsEnum, method testIntersectRegexp:

// LUCENE-7576
public void testIntersectRegexp() throws Exception {
    Directory d = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random(), d);
    Document doc = new Document();
    doc.add(newStringField("field", "foobar", Field.Store.NO));
    w.addDocument(doc);
    IndexReader r = w.getReader();
    Fields fields = MultiFields.getFields(r);
    CompiledAutomaton automaton = new CompiledAutomaton(new RegExp("do_not_match_anything").toAutomaton());
    Terms terms = fields.terms("field");
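    // intersect only accepts NORMAL automatons; a single-literal regexp compiles to a SINGLE automaton, so this call must throw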
    String message = expectThrows(IllegalArgumentException.class, () -> {
        terms.intersect(automaton, null);
    }).getMessage();
    assertEquals("please use CompiledAutomaton.getTermsEnum instead", message);
    r.close();
    w.close();
    d.close();
}
Also used: RegExp (org.apache.lucene.util.automaton.RegExp), CompiledAutomaton (org.apache.lucene.util.automaton.CompiledAutomaton), Document (org.apache.lucene.document.Document), Directory (org.apache.lucene.store.Directory)
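
The assertion message points at the supported path: Terms.intersect only accepts NORMAL automatons, while CompiledAutomaton.getTermsEnum handles every automaton type. A minimal sketch of that call, reusing terms from the test above and assuming TermsEnum and BytesRef imports (the variable names are illustrative):

CompiledAutomaton single = new CompiledAutomaton(new RegExp("foobar").toAutomaton());
// getTermsEnum works for every automaton type, including SINGLE and NONE
TermsEnum te = single.getTermsEnum(terms);
assertEquals(new BytesRef("foobar"), te.next());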

Example 17 with RegExp

Use of org.apache.lucene.util.automaton.RegExp in the apache/lucene-solr project.

From class TestSpanFirstQuery, method testStartPositions:

public void testStartPositions() throws Exception {
    Directory dir = newDirectory();
    // mimic StopAnalyzer
    CharacterRunAutomaton stopSet = new CharacterRunAutomaton(new RegExp("the|a|of").toAutomaton());
    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet);
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, analyzer);
    Document doc = new Document();
    doc.add(newTextField("field", "the quick brown fox", Field.Store.NO));
    writer.addDocument(doc);
    Document doc2 = new Document();
    doc2.add(newTextField("field", "quick brown fox", Field.Store.NO));
    writer.addDocument(doc2);
    IndexReader reader = writer.getReader();
    IndexSearcher searcher = newSearcher(reader);
    // user queries on "starts-with quick"
    SpanQuery sfq = spanFirstQuery(spanTermQuery("field", "quick"), 1);
    assertEquals(1, searcher.search(sfq, 10).totalHits);
    // user queries on "starts-with the quick"
    SpanQuery include = spanFirstQuery(spanTermQuery("field", "quick"), 2);
    sfq = spanNotQuery(include, sfq);
    assertEquals(1, searcher.search(sfq, 10).totalHits);
    writer.close();
    reader.close();
    dir.close();
}
Also used: IndexSearcher (org.apache.lucene.search.IndexSearcher), MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer), RegExp (org.apache.lucene.util.automaton.RegExp), CharacterRunAutomaton (org.apache.lucene.util.automaton.CharacterRunAutomaton), IndexReader (org.apache.lucene.index.IndexReader), Analyzer (org.apache.lucene.analysis.Analyzer), Document (org.apache.lucene.document.Document), RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter), Directory (org.apache.lucene.store.Directory)
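
The stop set here is just an automaton over whole strings, and CharacterRunAutomaton.run performs the membership test directly. A small sketch using only classes already imported above (the sample tokens are illustrative):

CharacterRunAutomaton stopSet = new CharacterRunAutomaton(new RegExp("the|a|of").toAutomaton());
// run() matches the entire input string against the automaton
assertTrue(stopSet.run("of"));      // stop word: accepted
assertFalse(stopSet.run("quick"));  // regular token: rejected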

Example 18 with RegExp

Use of org.apache.lucene.util.automaton.RegExp in the apache/lucene-solr project.

From class Dictionary, method parseAffix:

/**
   * Parses a specific affix rule putting the result into the provided affix map
   * 
   * @param affixes Map where the result of the parsing will be put
   * @param header Header line of the affix rule
   * @param reader BufferedReader to read the content of the rule from
   * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex
   *                         pattern
   * @param seenPatterns map from condition -> index of patterns, for deduplication.
   * @param seenStrips map from strip string -> index, for deduplication
   * @throws IOException Can be thrown while reading the rule
   * @throws ParseException Thrown if a rule in the affix file is malformed
   */
private void parseAffix(TreeMap<String, List<Integer>> affixes, String header, LineNumberReader reader, String conditionPattern, Map<String, Integer> seenPatterns, Map<String, Integer> seenStrips) throws IOException, ParseException {
    BytesRefBuilder scratch = new BytesRefBuilder();
    StringBuilder sb = new StringBuilder();
    String[] args = header.split("\\s+");
    boolean crossProduct = args[2].equals("Y");
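    // identity check against the shared suffix-pattern constant, telling suffix rules from prefix rules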
    boolean isSuffix = conditionPattern == SUFFIX_CONDITION_REGEX_PATTERN;
    int numLines = Integer.parseInt(args[3]);
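    // each affix record is four shorts (8 bytes), hence the << 3 scaling here and below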
    affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3));
    ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3);
    for (int i = 0; i < numLines; i++) {
        assert affixWriter.getPosition() == currentAffix << 3;
        String line = reader.readLine();
        String[] ruleArgs = line.split("\\s+");
        // condition is optional
        if (ruleArgs.length < 4) {
            throw new ParseException("The affix file contains a rule with less than four elements: " + line, reader.getLineNumber());
        }
        char flag = flagParsingStrategy.parseFlag(ruleArgs[1]);
        String strip = ruleArgs[2].equals("0") ? "" : ruleArgs[2];
        String affixArg = ruleArgs[3];
        char[] appendFlags = null;
        // first: parse continuation classes out of affix
        int flagSep = affixArg.lastIndexOf('/');
        if (flagSep != -1) {
            String flagPart = affixArg.substring(flagSep + 1);
            affixArg = affixArg.substring(0, flagSep);
            if (aliasCount > 0) {
                flagPart = getAliasValue(Integer.parseInt(flagPart));
            }
            appendFlags = flagParsingStrategy.parseFlags(flagPart);
            Arrays.sort(appendFlags);
            twoStageAffix = true;
        }
        // zero affix -> empty string
        if ("0".equals(affixArg)) {
            affixArg = "";
        }
        String condition = ruleArgs.length > 4 ? ruleArgs[4] : ".";
        // at least the gascon affix file has this issue
        if (condition.startsWith("[") && condition.indexOf(']') == -1) {
            condition = condition + "]";
        }
        // "dash hasn't got special meaning" (we must escape it)
        if (condition.indexOf('-') >= 0) {
            condition = escapeDash(condition);
        }
        final String regex;
        if (".".equals(condition)) {
            // Zero condition is indicated by dot
            regex = ".*";
        } else if (condition.equals(strip)) {
            // TODO: optimize this better:
            regex = ".*";
        // if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
        // but this is complicated...
        } else {
            regex = String.format(Locale.ROOT, conditionPattern, condition);
        }
        // deduplicate patterns
        Integer patternIndex = seenPatterns.get(regex);
        if (patternIndex == null) {
            patternIndex = patterns.size();
            if (patternIndex > Short.MAX_VALUE) {
                throw new UnsupportedOperationException("Too many patterns, please report this to dev@lucene.apache.org");
            }
            seenPatterns.put(regex, patternIndex);
            CharacterRunAutomaton pattern = new CharacterRunAutomaton(new RegExp(regex, RegExp.NONE).toAutomaton());
            patterns.add(pattern);
        }
        Integer stripOrd = seenStrips.get(strip);
        if (stripOrd == null) {
            stripOrd = seenStrips.size();
            seenStrips.put(strip, stripOrd);
            if (stripOrd > Character.MAX_VALUE) {
                throw new UnsupportedOperationException("Too many unique strips, please report this to dev@lucene.apache.org");
            }
        }
        if (appendFlags == null) {
            appendFlags = NOFLAGS;
        }
        encodeFlags(scratch, appendFlags);
        int appendFlagsOrd = flagLookup.add(scratch.get());
        if (appendFlagsOrd < 0) {
            // already exists in our hash
            appendFlagsOrd = (-appendFlagsOrd) - 1;
        } else if (appendFlagsOrd > Short.MAX_VALUE) {
            // this limit is probably flexible, but it's a good sanity check too
            throw new UnsupportedOperationException("Too many unique append flags, please report this to dev@lucene.apache.org");
        }
        affixWriter.writeShort((short) flag);
        affixWriter.writeShort((short) stripOrd.intValue());
        // encode crossProduct into patternIndex
        int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0);
        affixWriter.writeShort((short) patternOrd);
        affixWriter.writeShort((short) appendFlagsOrd);
        if (needsInputCleaning) {
            CharSequence cleaned = cleanInput(affixArg, sb);
            affixArg = cleaned.toString();
        }
        if (isSuffix) {
            affixArg = new StringBuilder(affixArg).reverse().toString();
        }
        List<Integer> list = affixes.get(affixArg);
        if (list == null) {
            list = new ArrayList<>();
            affixes.put(affixArg, list);
        }
        list.add(currentAffix);
        currentAffix++;
    }
}
Also used: BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder), RegExp (org.apache.lucene.util.automaton.RegExp), CharacterRunAutomaton (org.apache.lucene.util.automaton.CharacterRunAutomaton), ByteArrayDataOutput (org.apache.lucene.store.ByteArrayDataOutput), ParseException (java.text.ParseException)
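
The RegExp use worth isolating here is the condition handling: each Hunspell condition is spliced into conditionPattern via String.format and compiled once per distinct regex. A sketch of that round trip, assuming the suffix form of the pattern is ".*%s" (the condition string and the sample words are illustrative):

String condition = "[^aeiou]y";
String regex = String.format(Locale.ROOT, ".*%s", condition);
CharacterRunAutomaton pattern = new CharacterRunAutomaton(new RegExp(regex, RegExp.NONE).toAutomaton());
// the compiled automaton decides whether a stem satisfies the affix condition
assertTrue(pattern.run("fly"));    // consonant before the final y
assertFalse(pattern.run("play"));  // vowel before the final y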

Example 19 with RegExp

Use of org.apache.lucene.util.automaton.RegExp in the apache/lucene-solr project.

From class MinHashFilterTest, method createMockShingleTokenizer:

private static Tokenizer createMockShingleTokenizer(int shingleSize, String shingles) {
    MockTokenizer tokenizer = new MockTokenizer(new CharacterRunAutomaton(new RegExp("[^ \t\r\n]+([ \t\r\n]+[^ \t\r\n]+){" + (shingleSize - 1) + "}").toAutomaton()), true);
    tokenizer.setEnableChecks(true);
    if (shingles != null) {
        tokenizer.setReader(new StringReader(shingles));
    }
    return tokenizer;
}
Also used: MockTokenizer (org.apache.lucene.analysis.MockTokenizer), RegExp (org.apache.lucene.util.automaton.RegExp), CharacterRunAutomaton (org.apache.lucene.util.automaton.CharacterRunAutomaton), StringReader (java.io.StringReader)
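
For a concrete feel of the pattern, shingleSize = 2 expands the regex to "[^ \t\r\n]+([ \t\r\n]+[^ \t\r\n]+){1}", that is, exactly two whitespace-separated words. A quick sketch (the sample strings are illustrative):

CharacterRunAutomaton bigram = new CharacterRunAutomaton(new RegExp("[^ \t\r\n]+([ \t\r\n]+[^ \t\r\n]+){1}").toAutomaton());
assertTrue(bigram.run("quick brown"));  // exactly two tokens
assertFalse(bigram.run("quick"));       // one token, the repetition is unmet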

Example 20 with RegExp

Use of org.apache.lucene.util.automaton.RegExp in the apache/lucene-solr project.

From class TestBlockPostingsFormat3, method assertTerms:

// following code is almost an exact dup of code from TestDuelingCodecs: sorry!
public void assertTerms(Terms leftTerms, Terms rightTerms, boolean deep) throws Exception {
    if (leftTerms == null || rightTerms == null) {
        assertNull(leftTerms);
        assertNull(rightTerms);
        return;
    }
    assertTermsStatistics(leftTerms, rightTerms);
    // NOTE: we don't assert hasOffsets/hasPositions/hasPayloads because they are allowed to be different
    boolean bothHavePositions = leftTerms.hasPositions() && rightTerms.hasPositions();
    TermsEnum leftTermsEnum = leftTerms.iterator();
    TermsEnum rightTermsEnum = rightTerms.iterator();
    assertTermsEnum(leftTermsEnum, rightTermsEnum, true, bothHavePositions);
    assertTermsSeeking(leftTerms, rightTerms);
    if (deep) {
        int numIntersections = atLeast(3);
        for (int i = 0; i < numIntersections; i++) {
            String re = AutomatonTestUtil.randomRegexp(random());
            CompiledAutomaton automaton = new CompiledAutomaton(new RegExp(re, RegExp.NONE).toAutomaton());
            if (automaton.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
                // TODO: test start term too
                TermsEnum leftIntersection = leftTerms.intersect(automaton, null);
                TermsEnum rightIntersection = rightTerms.intersect(automaton, null);
                assertTermsEnum(leftIntersection, rightIntersection, rarely(), bothHavePositions);
            }
        }
    }
}
Also used: RegExp (org.apache.lucene.util.automaton.RegExp), CompiledAutomaton (org.apache.lucene.util.automaton.CompiledAutomaton), TermsEnum (org.apache.lucene.index.TermsEnum)
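
The type guard before intersect is the flip side of Example 16: a random regexp can compile to a non-NORMAL type (such as NONE, ALL, or SINGLE), for which intersect throws. A hedged sketch of the dispatch, reusing leftTerms from the method above (the regexp is illustrative):

CompiledAutomaton ca = new CompiledAutomaton(new RegExp("fo.*", RegExp.NONE).toAutomaton());
// intersect is only legal for NORMAL; everything else goes through getTermsEnum
TermsEnum te = ca.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL
        ? leftTerms.intersect(ca, null)
        : ca.getTermsEnum(leftTerms);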

Aggregations

RegExp (org.apache.lucene.util.automaton.RegExp): 30 usages
CharacterRunAutomaton (org.apache.lucene.util.automaton.CharacterRunAutomaton): 15 usages
MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 11 usages
Document (org.apache.lucene.document.Document): 9 usages
Directory (org.apache.lucene.store.Directory): 9 usages
BytesRef (org.apache.lucene.util.BytesRef): 9 usages
CompiledAutomaton (org.apache.lucene.util.automaton.CompiledAutomaton): 9 usages
Analyzer (org.apache.lucene.analysis.Analyzer): 5 usages
Automaton (org.apache.lucene.util.automaton.Automaton): 5 usages
Term (org.apache.lucene.index.Term): 4 usages
IndexReader (org.apache.lucene.index.IndexReader): 3 usages
PhraseQuery (org.apache.lucene.search.PhraseQuery): 3 usages
TermQuery (org.apache.lucene.search.TermQuery): 3 usages
StringReader (java.io.StringReader): 2 usages
TreeSet (java.util.TreeSet): 2 usages
MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 2 usages
IndexWriter (org.apache.lucene.index.IndexWriter): 2 usages
RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter): 2 usages
TermsEnum (org.apache.lucene.index.TermsEnum): 2 usages
CommonQueryParserConfiguration (org.apache.lucene.queryparser.flexible.standard.CommonQueryParserConfiguration): 2 usages