Use of org.apache.lucene.util.automaton.RegExp in project lucene-solr by apache.
Class FastVectorHighlighterTest, method matchedFieldsTestCase:
private void matchedFieldsTestCase(boolean useMatchedFields, boolean fieldMatch, String fieldValue, String expected, Query... queryClauses) throws IOException {
  Document doc = new Document();
  FieldType stored = new FieldType(TextField.TYPE_STORED);
  stored.setStoreTermVectorOffsets(true);
  stored.setStoreTermVectorPositions(true);
  stored.setStoreTermVectors(true);
  stored.freeze();
  FieldType matched = new FieldType(TextField.TYPE_NOT_STORED);
  matched.setStoreTermVectorOffsets(true);
  matched.setStoreTermVectorPositions(true);
  matched.setStoreTermVectors(true);
  matched.freeze();
  // Whitespace tokenized with English stop words
  doc.add(new Field("field", fieldValue, stored));
  // Whitespace tokenized without stop words
  doc.add(new Field("field_exact", fieldValue, matched));
  // Whitespace tokenized without toLower
  doc.add(new Field("field_super_exact", fieldValue, matched));
  // Each letter is a token
  doc.add(new Field("field_characters", fieldValue, matched));
  // Every three letters is a token
  doc.add(new Field("field_tripples", fieldValue, matched));
  // Sliced at 10 chars then analyzed just like field
  doc.add(new Field("field_sliced", fieldValue.substring(0, Math.min(fieldValue.length() - 1, 10)), matched));
  // Hacky field containing "der" and "red" at pos = 0
  doc.add(new Field("field_der_red", new CannedTokenStream(token("der", 1, 0, 3), token("red", 0, 0, 3)), matched));
  final Map<String, Analyzer> fieldAnalyzers = new TreeMap<>();
  fieldAnalyzers.put("field", new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET));
  fieldAnalyzers.put("field_exact", new MockAnalyzer(random()));
  fieldAnalyzers.put("field_super_exact", new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false));
  fieldAnalyzers.put("field_characters", new MockAnalyzer(random(), new CharacterRunAutomaton(new RegExp(".").toAutomaton()), true));
  fieldAnalyzers.put("field_tripples", new MockAnalyzer(random(), new CharacterRunAutomaton(new RegExp("...").toAutomaton()), true));
  fieldAnalyzers.put("field_sliced", fieldAnalyzers.get("field"));
  // This is required even though we provide a token stream
  fieldAnalyzers.put("field_der_red", fieldAnalyzers.get("field"));
  Analyzer analyzer = new DelegatingAnalyzerWrapper(Analyzer.PER_FIELD_REUSE_STRATEGY) {
    @Override
    public Analyzer getWrappedAnalyzer(String fieldName) {
      return fieldAnalyzers.get(fieldName);
    }
  };
  Directory dir = newDirectory();
  IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(analyzer));
  writer.addDocument(doc);
  FastVectorHighlighter highlighter = new FastVectorHighlighter();
  FragListBuilder fragListBuilder = new SimpleFragListBuilder();
  FragmentsBuilder fragmentsBuilder = new ScoreOrderFragmentsBuilder();
  IndexReader reader = DirectoryReader.open(writer);
  String[] preTags = new String[] { "<b>" };
  String[] postTags = new String[] { "</b>" };
  Encoder encoder = new DefaultEncoder();
  int docId = 0;
  BooleanQuery.Builder query = new BooleanQuery.Builder();
  for (Query clause : queryClauses) {
    query.add(clause, Occur.MUST);
  }
  FieldQuery fieldQuery = new FieldQuery(query.build(), reader, true, fieldMatch);
  String[] bestFragments;
  if (useMatchedFields) {
    Set<String> matchedFields = new HashSet<>();
    matchedFields.add("field");
    matchedFields.add("field_exact");
    matchedFields.add("field_super_exact");
    matchedFields.add("field_characters");
    matchedFields.add("field_tripples");
    matchedFields.add("field_sliced");
    matchedFields.add("field_der_red");
    bestFragments = highlighter.getBestFragments(fieldQuery, reader, docId, "field", matchedFields, 25, 1, fragListBuilder, fragmentsBuilder, preTags, postTags, encoder);
  } else {
    bestFragments = highlighter.getBestFragments(fieldQuery, reader, docId, "field", 25, 1, fragListBuilder, fragmentsBuilder, preTags, postTags, encoder);
  }
  assertEquals(expected, bestFragments[0]);
  reader.close();
  writer.close();
  dir.close();
}
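The RegExp usage worth noting here is in the per-field analyzers: the automaton compiled from the expression tells MockTokenizer what a single token looks like, which is how "." produces one-letter tokens and "..." three-letter tokens. Below is a minimal standalone sketch of that pattern, assuming the Lucene test framework (MockAnalyzer) is on the classpath; the class name, field name, and input string are illustrative, not taken from the test:

import java.util.Random;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;

public class RegExpTokenizerSketch {
  public static void main(String[] args) throws Exception {
    // "." accepts exactly one character, so each letter becomes its own token,
    // matching the "field_characters" comment in the test above.
    CharacterRunAutomaton oneChar = new CharacterRunAutomaton(new RegExp(".").toAutomaton());
    MockAnalyzer analyzer = new MockAnalyzer(new Random(0), oneChar, true);
    try (TokenStream ts = analyzer.tokenStream("field_characters", "cat")) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        System.out.println(term); // per the test's comment: c, a, t
      }
      ts.end();
    }
  }
}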
Use of org.apache.lucene.util.automaton.RegExp in project lucene-solr by apache.
Class TestQPHelper, method testStopwords:
public void testStopwords() throws Exception {
  StandardQueryParser qp = new StandardQueryParser();
  CharacterRunAutomaton stopSet = new CharacterRunAutomaton(new RegExp("the|foo").toAutomaton());
  qp.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet));
  Query result = qp.parse("a:the OR a:foo", "a");
  assertNotNull("result is null and it shouldn't be", result);
  assertTrue("result is not a MatchNoDocsQuery", result instanceof MatchNoDocsQuery);
  result = qp.parse("a:woo OR a:the", "a");
  assertNotNull("result is null and it shouldn't be", result);
  assertTrue("result is not a TermQuery", result instanceof TermQuery);
  result = qp.parse("(fieldX:xxxxx OR fieldy:xxxxxxxx)^2 AND (fieldx:the OR fieldy:foo)", "a");
  Query expected = new BooleanQuery.Builder()
      .add(new TermQuery(new Term("fieldX", "xxxxx")), Occur.SHOULD)
      .add(new TermQuery(new Term("fieldy", "xxxxxxxx")), Occur.SHOULD)
      .build();
  expected = new BoostQuery(expected, 2f);
  assertEquals(expected, result);
}
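The first assertion holds because both terms of "a:the OR a:foo" are accepted by the stopword automaton, so analysis drops every clause and the parser collapses the query to a MatchNoDocsQuery. A quick standalone probe of the automaton itself (the class name is illustrative):

import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;

public class StopSetProbe {
  public static void main(String[] args) {
    CharacterRunAutomaton stopSet = new CharacterRunAutomaton(new RegExp("the|foo").toAutomaton());
    System.out.println(stopSet.run("the")); // true: dropped as a stopword
    System.out.println(stopSet.run("foo")); // true: dropped as a stopword
    System.out.println(stopSet.run("woo")); // false: survives analysis as a TermQuery
  }
}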
Use of org.apache.lucene.util.automaton.RegExp in project lucene-solr by apache.
Class TestFSTs, method testRealTerms:
// Build FST for all unique terms in the test line docs
// file, up until a doc limit
public void testRealTerms() throws Exception {
  final LineFileDocs docs = new LineFileDocs(random());
  final int numDocs = TEST_NIGHTLY ? atLeast(1000) : atLeast(100);
  MockAnalyzer analyzer = new MockAnalyzer(random());
  analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));
  final IndexWriterConfig conf = newIndexWriterConfig(analyzer).setMaxBufferedDocs(-1).setRAMBufferSizeMB(64);
  final Path tempDir = createTempDir("fstlines");
  final Directory dir = newFSDirectory(tempDir);
  final IndexWriter writer = new IndexWriter(dir, conf);
  Document doc;
  int docCount = 0;
  while ((doc = docs.nextDoc()) != null && docCount < numDocs) {
    writer.addDocument(doc);
    docCount++;
  }
  IndexReader r = DirectoryReader.open(writer);
  writer.close();
  final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
  Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
  boolean storeOrd = random().nextBoolean();
  if (VERBOSE) {
    if (storeOrd) {
      System.out.println("FST stores ord");
    } else {
      System.out.println("FST stores docFreq");
    }
  }
  Terms terms = MultiFields.getTerms(r, "body");
  if (terms != null) {
    final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
    final TermsEnum termsEnum = terms.iterator();
    if (VERBOSE) {
      System.out.println("TEST: got termsEnum=" + termsEnum);
    }
    BytesRef term;
    int ord = 0;
    Automaton automaton = new RegExp(".*", RegExp.NONE).toAutomaton();
    final TermsEnum termsEnum2 = terms.intersect(new CompiledAutomaton(automaton, false, false), null);
    while ((term = termsEnum.next()) != null) {
      BytesRef term2 = termsEnum2.next();
      assertNotNull(term2);
      assertEquals(term, term2);
      assertEquals(termsEnum.docFreq(), termsEnum2.docFreq());
      assertEquals(termsEnum.totalTermFreq(), termsEnum2.totalTermFreq());
      if (ord == 0) {
        try {
          termsEnum.ord();
        } catch (UnsupportedOperationException uoe) {
          if (VERBOSE) {
            System.out.println("TEST: codec doesn't support ord; FST stores docFreq");
          }
          storeOrd = false;
        }
      }
      final int output;
      if (storeOrd) {
        output = ord;
      } else {
        output = termsEnum.docFreq();
      }
      builder.add(Util.toIntsRef(term, scratchIntsRef), (long) output);
      ord++;
      if (VERBOSE && ord % 100000 == 0 && LuceneTestCase.TEST_NIGHTLY) {
        System.out.println(ord + " terms...");
      }
    }
    FST<Long> fst = builder.finish();
    if (VERBOSE) {
      System.out.println("FST: " + docCount + " docs; " + ord + " terms; " + builder.getNodeCount() + " nodes; " + builder.getArcCount() + " arcs;" + " " + fst.ramBytesUsed() + " bytes");
    }
    if (ord > 0) {
      final Random random = new Random(random().nextLong());
      // Now confirm BytesRefFSTEnum and TermsEnum act the same:
      final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<>(fst);
      int num = atLeast(1000);
      for (int iter = 0; iter < num; iter++) {
        final BytesRef randomTerm = new BytesRef(getRandomString(random));
        if (VERBOSE) {
          System.out.println("TEST: seek non-exist " + randomTerm.utf8ToString() + " " + randomTerm);
        }
        final TermsEnum.SeekStatus seekResult = termsEnum.seekCeil(randomTerm);
        final InputOutput<Long> fstSeekResult = fstEnum.seekCeil(randomTerm);
        if (seekResult == TermsEnum.SeekStatus.END) {
          assertNull("got " + (fstSeekResult == null ? "null" : fstSeekResult.input.utf8ToString()) + " but expected null", fstSeekResult);
        } else {
          assertSame(termsEnum, fstEnum, storeOrd);
          for (int nextIter = 0; nextIter < 10; nextIter++) {
            if (VERBOSE) {
              System.out.println("TEST: next");
              if (storeOrd) {
                System.out.println(" ord=" + termsEnum.ord());
              }
            }
            if (termsEnum.next() != null) {
              if (VERBOSE) {
                System.out.println(" term=" + termsEnum.term().utf8ToString());
              }
              assertNotNull(fstEnum.next());
              assertSame(termsEnum, fstEnum, storeOrd);
            } else {
              if (VERBOSE) {
                System.out.println(" end!");
              }
              BytesRefFSTEnum.InputOutput<Long> nextResult = fstEnum.next();
              if (nextResult != null) {
                System.out.println("expected null but got: input=" + nextResult.input.utf8ToString() + " output=" + outputs.outputToString(nextResult.output));
                fail();
              }
              break;
            }
          }
        }
      }
    }
  }
  r.close();
  dir.close();
}
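The intersect call above compiles the match-everything expression ".*" only to drive a second, parallel enumeration of every term. The same API restricts the walk when the expression is narrower; here is a sketch against the same Lucene-6.x-era API used by the test, assuming a Terms instance obtained as in the test (the class name, method name, and pattern are illustrative):

import java.io.IOException;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.RegExp;

public class IntersectSketch {
  // e.g. terms = MultiFields.getTerms(reader, "body"), as in the test above
  static void printTermsMatching(Terms terms, String regex) throws IOException {
    // Compile the regex to an automaton; RegExp.NONE keeps the core syntax only.
    Automaton a = new RegExp(regex, RegExp.NONE).toAutomaton();
    // intersect() enumerates just the terms accepted by the automaton,
    // starting from the beginning (null start term).
    TermsEnum te = terms.intersect(new CompiledAutomaton(a, false, false), null);
    BytesRef term;
    while ((term = te.next()) != null) {
      System.out.println(term.utf8ToString() + " docFreq=" + te.docFreq());
    }
  }
}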
Use of org.apache.lucene.util.automaton.RegExp in project lucene-solr by apache.
Class QueryParserTestBase, method testPhraseQueryPositionIncrements:
public void testPhraseQueryPositionIncrements() throws Exception {
  CharacterRunAutomaton stopStopList = new CharacterRunAutomaton(new RegExp("[sS][tT][oO][pP]").toAutomaton());
  CommonQueryParserConfiguration qp = getParserConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false, stopStopList));
  qp.setEnablePositionIncrements(true);
  PhraseQuery.Builder phraseQuery = new PhraseQuery.Builder();
  phraseQuery.add(new Term("field", "1"));
  phraseQuery.add(new Term("field", "2"), 2);
  assertEquals(phraseQuery.build(), getQuery("\"1 stop 2\"", qp));
}
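Why the expected PhraseQuery adds the second term at position 2: the stopword filter removes "stop" (the case-insensitive [sS][tT][oO][pP] automaton accepts it), and with position increments enabled the hole it leaves is preserved, so "1" analyzes to position 0 and "2" to position 2. A sketch of that analysis, assuming the Lucene test framework is available (class name and random seed are illustrative):

import java.util.Random;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;

public class PositionGapSketch {
  public static void main(String[] args) throws Exception {
    CharacterRunAutomaton stop = new CharacterRunAutomaton(new RegExp("[sS][tT][oO][pP]").toAutomaton());
    MockAnalyzer analyzer = new MockAnalyzer(new Random(0), MockTokenizer.WHITESPACE, false, stop);
    try (TokenStream ts = analyzer.tokenStream("field", "1 stop 2")) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      PositionIncrementAttribute posIncr = ts.addAttribute(PositionIncrementAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        // expected: "1" with increment 1, then "2" with increment 2 (the stopword gap)
        System.out.println(term + " +" + posIncr.getPositionIncrement());
      }
      ts.end();
    }
  }
}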
Use of org.apache.lucene.util.automaton.RegExp in project lucene-solr by apache.
Class TestTermsEnum, method testIntersectEmptyString:
public void testIntersectEmptyString() throws Exception {
  Directory dir = newDirectory();
  IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
  iwc.setMergePolicy(new LogDocMergePolicy());
  RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
  Document doc = new Document();
  doc.add(newStringField("field", "", Field.Store.NO));
  doc.add(newStringField("field", "abc", Field.Store.NO));
  w.addDocument(doc);
  doc = new Document();
  // Add the empty string to both documents, so that singletonDocID == -1.
  // For an FST-based term dict, we expect the first arc to be
  // flagged with HAS_FINAL_OUTPUT.
  doc.add(newStringField("field", "abc", Field.Store.NO));
  doc.add(newStringField("field", "", Field.Store.NO));
  w.addDocument(doc);
  w.forceMerge(1);
  DirectoryReader r = w.getReader();
  w.close();
  LeafReader sub = getOnlyLeafReader(r);
  Terms terms = sub.fields().terms("field");
  // accept ALL
  Automaton automaton = new RegExp(".*", RegExp.NONE).toAutomaton();
  CompiledAutomaton ca = new CompiledAutomaton(automaton, false, false);
  TermsEnum te = terms.intersect(ca, null);
  PostingsEnum de;
  assertEquals("", te.next().utf8ToString());
  de = te.postings(null, PostingsEnum.NONE);
  assertEquals(0, de.nextDoc());
  assertEquals(1, de.nextDoc());
  assertEquals("abc", te.next().utf8ToString());
  de = te.postings(null, PostingsEnum.NONE);
  assertEquals(0, de.nextDoc());
  assertEquals(1, de.nextDoc());
  assertNull(te.next());
  // pass the empty string as the start term; enumeration resumes after it, so "abc" comes first
  te = terms.intersect(ca, new BytesRef(""));
  assertEquals("abc", te.next().utf8ToString());
  de = te.postings(null, PostingsEnum.NONE);
  assertEquals(0, de.nextDoc());
  assertEquals(1, de.nextDoc());
  assertNull(te.next());
  r.close();
  dir.close();
}
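A note on the RegExp.NONE flag used above: the one-argument RegExp constructor enables all of Lucene's optional syntax extensions (intersection '&', complement '~', intervals '<n-m>', and so on), while NONE limits parsing to the core operators so those characters are taken literally. A small probe of that difference, under the assumption that the default flag set is ALL, as in the dk.brics-derived syntax (the class name is illustrative):

import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;

public class RegExpFlagsSketch {
  public static void main(String[] args) {
    // With NONE, '&' is an ordinary character, so "a&b" matches the literal string.
    System.out.println(new CharacterRunAutomaton(new RegExp("a&b", RegExp.NONE).toAutomaton()).run("a&b")); // true
    // With the default flags, '&' is language intersection, and "a" & "b" is the empty language.
    System.out.println(new CharacterRunAutomaton(new RegExp("a&b").toAutomaton()).run("a&b")); // false
  }
}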