Search in sources :

Example 1 with MockTokenFilter

use of org.apache.lucene.analysis.MockTokenFilter in project lucene-solr by apache.

In class TestIndexWriter, method testStopwordsPosIncHole.

/**
 * LUCENE-3849: indexes two values of the "body" field ("just a" and "test of gaps") through an
 * analyzer that chains {@code MockTokenizer} with a {@code MockTokenFilter} using
 * {@code ENGLISH_STOPSET}, then checks that the phrase query body:"just ? test" ("just" at
 * position 0, "test" at position 2) matches exactly one document.
 *
 * <p>Every closeable resource, including the analyzer, is released through try-with-resources
 * so nothing is left open when the assertion fails.
 *
 * @throws Exception if indexing or searching fails
 */
public void testStopwordsPosIncHole() throws Exception {
    try (Directory dir = newDirectory();
        Analyzer a = new Analyzer() {

            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = new MockTokenizer();
                TokenStream stream = new MockTokenFilter(tokenizer, MockTokenFilter.ENGLISH_STOPSET);
                return new TokenStreamComponents(tokenizer, stream);
            }
        }) {
        final IndexReader ir;
        // The writer is closed before the searcher is created, exactly as before, so the
        // sequence of random() draws (and therefore seed reproducibility) is unchanged.
        try (RandomIndexWriter iw = new RandomIndexWriter(random(), dir, a)) {
            Document doc = new Document();
            doc.add(new TextField("body", "just a", Field.Store.NO));
            doc.add(new TextField("body", "test of gaps", Field.Store.NO));
            iw.addDocument(doc);
            ir = iw.getReader();
        }
        try (IndexReader reader = ir) {
            IndexSearcher is = newSearcher(reader);
            PhraseQuery.Builder builder = new PhraseQuery.Builder();
            builder.add(new Term("body", "just"), 0);
            builder.add(new Term("body", "test"), 2);
            PhraseQuery pq = builder.build();
            // body:"just ? test"
            assertEquals(1, is.search(pq, 5).totalHits);
        }
    }
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) TokenStream(org.apache.lucene.analysis.TokenStream) MockTokenFilter(org.apache.lucene.analysis.MockTokenFilter) PhraseQuery(org.apache.lucene.search.PhraseQuery) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer) Document(org.apache.lucene.document.Document) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) TextField(org.apache.lucene.document.TextField) Tokenizer(org.apache.lucene.analysis.Tokenizer) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) MMapDirectory(org.apache.lucene.store.MMapDirectory) Directory(org.apache.lucene.store.Directory) RAMDirectory(org.apache.lucene.store.RAMDirectory) FSDirectory(org.apache.lucene.store.FSDirectory) SimpleFSDirectory(org.apache.lucene.store.SimpleFSDirectory) NIOFSDirectory(org.apache.lucene.store.NIOFSDirectory)

Example 2 with MockTokenFilter

use of org.apache.lucene.analysis.MockTokenFilter in project lucene-solr by apache.

In class TestIndexWriter, method testStopwordsPosIncHole2.

/**
 * LUCENE-3849: same scenario as the single-filter variant, but the analyzer stacks two
 * {@code MockTokenFilter}s: one using {@code ENGLISH_STOPSET} and one built from an automaton
 * accepting the string "foobar". The field values "just a foobar" and "test of gaps" are
 * indexed, and the phrase query body:"just ? ? test" ("just" at position 0, "test" at
 * position 3) must match exactly one document.
 *
 * <p>Every closeable resource, including the analyzer, is released through try-with-resources
 * so nothing is left open when the assertion fails.
 *
 * @throws Exception if indexing or searching fails
 */
public void testStopwordsPosIncHole2() throws Exception {
    // use two stopfilters for testing here
    final Automaton secondSet = Automata.makeString("foobar");
    try (Directory dir = newDirectory();
        Analyzer a = new Analyzer() {

            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = new MockTokenizer();
                TokenStream stream = new MockTokenFilter(tokenizer, MockTokenFilter.ENGLISH_STOPSET);
                stream = new MockTokenFilter(stream, new CharacterRunAutomaton(secondSet));
                return new TokenStreamComponents(tokenizer, stream);
            }
        }) {
        final IndexReader ir;
        // The writer is closed before the searcher is created, exactly as before, so the
        // sequence of random() draws (and therefore seed reproducibility) is unchanged.
        try (RandomIndexWriter iw = new RandomIndexWriter(random(), dir, a)) {
            Document doc = new Document();
            doc.add(new TextField("body", "just a foobar", Field.Store.NO));
            doc.add(new TextField("body", "test of gaps", Field.Store.NO));
            iw.addDocument(doc);
            ir = iw.getReader();
        }
        try (IndexReader reader = ir) {
            IndexSearcher is = newSearcher(reader);
            PhraseQuery.Builder builder = new PhraseQuery.Builder();
            builder.add(new Term("body", "just"), 0);
            builder.add(new Term("body", "test"), 3);
            PhraseQuery pq = builder.build();
            // body:"just ? ? test"
            assertEquals(1, is.search(pq, 5).totalHits);
        }
    }
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) TokenStream(org.apache.lucene.analysis.TokenStream) Automaton(org.apache.lucene.util.automaton.Automaton) CharacterRunAutomaton(org.apache.lucene.util.automaton.CharacterRunAutomaton) MockTokenFilter(org.apache.lucene.analysis.MockTokenFilter) PhraseQuery(org.apache.lucene.search.PhraseQuery) CharacterRunAutomaton(org.apache.lucene.util.automaton.CharacterRunAutomaton) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer) Document(org.apache.lucene.document.Document) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) TextField(org.apache.lucene.document.TextField) Tokenizer(org.apache.lucene.analysis.Tokenizer) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) MMapDirectory(org.apache.lucene.store.MMapDirectory) Directory(org.apache.lucene.store.Directory) RAMDirectory(org.apache.lucene.store.RAMDirectory) FSDirectory(org.apache.lucene.store.FSDirectory) SimpleFSDirectory(org.apache.lucene.store.SimpleFSDirectory) NIOFSDirectory(org.apache.lucene.store.NIOFSDirectory)

Example 3 with MockTokenFilter

use of org.apache.lucene.analysis.MockTokenFilter in project lucene-solr by apache.

In class TestTermAutomatonQuery, method testRandom.

/**
 * Randomized check that a {@code TermAutomatonQuery} built from a union of random token
 * strings matches the same documents as a {@code BooleanQuery} of {@code MultiPhraseQuery}
 * SHOULD clauses built from those same strings.
 *
 * <p>Each document holds {@code atLeast(10)} space-separated single-character tokens drawn
 * from 'a', 'b', 'c'. Each query string uses the same alphabet, with '*' (only ever placed at
 * interior positions) standing for "any term".
 *
 * @throws Exception if indexing or searching fails
 */
public void testRandom() throws Exception {
    int numDocs = atLeast(100);
    Directory dir = newDirectory();
    // Adds occasional random synonyms:
    Analyzer analyzer = new Analyzer() {

        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true, 100);
            tokenizer.setEnableChecks(true);
            // EMPTY_STOPSET: the filter is in the chain but is given an empty stop set.
            TokenFilter filt = new MockTokenFilter(tokenizer, MockTokenFilter.EMPTY_STOPSET);
            filt = new RandomSynonymFilter(filt);
            return new TokenStreamComponents(tokenizer, filt);
        }
    };
    IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
    RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
    // Index numDocs documents of random tokens; the loop index is stored as the "id" field.
    for (int i = 0; i < numDocs; i++) {
        Document doc = new Document();
        int numTokens = atLeast(10);
        StringBuilder sb = new StringBuilder();
        for (int j = 0; j < numTokens; j++) {
            sb.append(' ');
            // 97 == 'a'; with nextInt(3) in 0..2 this appends 'a', 'b' or 'c'.
            sb.append((char) (97 + random().nextInt(3)));
        }
        String contents = sb.toString();
        doc.add(newTextField("field", contents, Field.Store.NO));
        doc.add(new StoredField("id", "" + i));
        if (VERBOSE) {
            System.out.println("  doc " + i + " -> " + contents);
        }
        w.addDocument(doc);
    }
    IndexReader r = w.getReader();
    IndexSearcher s = newSearcher(r);
    // Used to match ANY using MultiPhraseQuery:
    Term[] allTerms = new Term[] { new Term("field", "a"), new Term("field", "b"), new Term("field", "c") };
    int numIters = atLeast(1000);
    for (int iter = 0; iter < numIters; iter++) {
        // Build the finite TermAutomatonQuery (a '*' label becomes an ANY transition) and
        // also the "equivalent" BooleanQuery and make sure they match the
        // same docs:
        BooleanQuery.Builder bq = new BooleanQuery.Builder();
        int count = TestUtil.nextInt(random(), 1, 5);
        // A set, so duplicate random strings collapse before the automaton is built.
        Set<BytesRef> strings = new HashSet<>();
        for (int i = 0; i < count; i++) {
            StringBuilder sb = new StringBuilder();
            int numTokens = TestUtil.nextInt(random(), 1, 5);
            for (int j = 0; j < numTokens; j++) {
                // '*' is only considered for interior positions, never the first or last one.
                if (j > 0 && j < numTokens - 1 && random().nextInt(5) == 3) {
                    sb.append('*');
                } else {
                    sb.append((char) (97 + random().nextInt(3)));
                }
            }
            String string = sb.toString();
            // One MultiPhraseQuery per string: '*' expands to all three terms at that position.
            MultiPhraseQuery.Builder mpqb = new MultiPhraseQuery.Builder();
            for (int j = 0; j < string.length(); j++) {
                if (string.charAt(j) == '*') {
                    mpqb.add(allTerms);
                } else {
                    mpqb.add(new Term("field", "" + string.charAt(j)));
                }
            }
            bq.add(mpqb.build(), BooleanClause.Occur.SHOULD);
            strings.add(new BytesRef(string));
        }
        // NOTE(review): sorted before makeStringUnion, presumably because it requires sorted
        // input - confirm against the Automata documentation.
        List<BytesRef> stringsList = new ArrayList<>(strings);
        Collections.sort(stringsList);
        Automaton a = Automata.makeStringUnion(stringsList);
        // Translate automaton to query:
        TermAutomatonQuery q = new TermAutomatonQuery("field");
        int numStates = a.getNumStates();
        // Create one query state per automaton state and copy its accept flag.
        // NOTE(review): relies on createState() handing out ids 0..numStates-1 in order - confirm.
        for (int i = 0; i < numStates; i++) {
            q.createState();
            q.setAccept(i, a.isAccept(i));
        }
        Transition t = new Transition();
        for (int i = 0; i < numStates; i++) {
            int transCount = a.initTransition(i, t);
            for (int j = 0; j < transCount; j++) {
                a.getNextTransition(t);
                // A transition covers the label range [min, max]; add one query transition per label.
                for (int label = t.min; label <= t.max; label++) {
                    if ((char) label == '*') {
                        q.addAnyTransition(t.source, t.dest);
                    } else {
                        q.addTransition(t.source, t.dest, "" + (char) label);
                    }
                }
            }
        }
        q.finish();
        if (VERBOSE) {
            System.out.println("TEST: iter=" + iter);
            for (BytesRef string : stringsList) {
                System.out.println("  string: " + string.utf8ToString());
            }
            System.out.println(q.toDot());
        }
        Query q1 = q;
        Query q2 = bq.build();
        // Sometimes wrap both queries with the same RandomQuery as a FILTER clause; the
        // two result sets must still agree.
        if (random().nextInt(5) == 1) {
            if (VERBOSE) {
                System.out.println("  use random filter");
            }
            RandomQuery filter = new RandomQuery(random().nextLong(), random().nextFloat());
            q1 = new BooleanQuery.Builder().add(q1, Occur.MUST).add(filter, Occur.FILTER).build();
            q2 = new BooleanQuery.Builder().add(q2, Occur.MUST).add(filter, Occur.FILTER).build();
        }
        TopDocs hits1 = s.search(q1, numDocs);
        TopDocs hits2 = s.search(q2, numDocs);
        Set<String> hits1Docs = toDocIDs(s, hits1);
        Set<String> hits2Docs = toDocIDs(s, hits2);
        try {
            // hits2 (the BooleanQuery) is passed as the expected value, hits1 (the automaton query) as the actual.
            assertEquals(hits2.totalHits, hits1.totalHits);
            assertEquals(hits2Docs, hits1Docs);
        } catch (AssertionError ae) {
            // Print which ids differ between the two result sets, then rethrow the failure.
            System.out.println("FAILED:");
            for (String id : hits1Docs) {
                if (hits2Docs.contains(id) == false) {
                    System.out.println(String.format(Locale.ROOT, "  id=%3s matched but should not have", id));
                }
            }
            for (String id : hits2Docs) {
                if (hits1Docs.contains(id) == false) {
                    System.out.println(String.format(Locale.ROOT, "  id=%3s did not match but should have", id));
                }
            }
            throw ae;
        }
    }
    // Close the writer, reader, directory and analyzer in a single call.
    IOUtils.close(w, r, dir, analyzer);
}
Also used : MockTokenFilter(org.apache.lucene.analysis.MockTokenFilter) ArrayList(java.util.ArrayList) Analyzer(org.apache.lucene.analysis.Analyzer) Document(org.apache.lucene.document.Document) StoredField(org.apache.lucene.document.StoredField) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory) MockTokenFilter(org.apache.lucene.analysis.MockTokenFilter) TokenFilter(org.apache.lucene.analysis.TokenFilter) HashSet(java.util.HashSet) Automaton(org.apache.lucene.util.automaton.Automaton) Term(org.apache.lucene.index.Term) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) IndexReader(org.apache.lucene.index.IndexReader) Transition(org.apache.lucene.util.automaton.Transition) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Example 4 with MockTokenFilter

use of org.apache.lucene.analysis.MockTokenFilter in project lucene-solr by apache.

In class TestIDVersionPostingsFormat, method testMissingPayload.

/**
 * Checks that adding and committing a document whose "id" field is indexed through a plain
 * {@code MockTokenizer}/{@code MockTokenFilter} chain throws
 * {@link IllegalArgumentException} when the codec uses {@code IDVersionPostingsFormat}.
 *
 * <p>The writer, analyzer and directory are released through try-with-resources so they are
 * closed even when the expected exception is not thrown and the test fails.
 *
 * @throws Exception if setting up or closing the index fails
 */
public void testMissingPayload() throws Exception {
    try (Directory dir = newDirectory();
        // MockAnalyzer minus maybePayload else it sometimes stuffs in an 8-byte payload!
        Analyzer a = new Analyzer() {

            @Override
            public TokenStreamComponents createComponents(String fieldName) {
                MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true, 100);
                tokenizer.setEnableChecks(true);
                MockTokenFilter filt = new MockTokenFilter(tokenizer, MockTokenFilter.EMPTY_STOPSET);
                return new TokenStreamComponents(tokenizer, filt);
            }
        }) {
        IndexWriterConfig iwc = newIndexWriterConfig(a);
        iwc.setCodec(TestUtil.alwaysPostingsFormat(new IDVersionPostingsFormat()));
        // Resource variables are implicitly final, so the lambda below may capture the writer.
        try (RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc)) {
            Document doc = new Document();
            doc.add(newTextField("id", "id", Field.Store.NO));
            expectThrows(IllegalArgumentException.class, () -> {
                w.addDocument(doc);
                w.commit();
            });
        }
    }
}
Also used : MockTokenizer(org.apache.lucene.analysis.MockTokenizer) MockTokenFilter(org.apache.lucene.analysis.MockTokenFilter) Analyzer(org.apache.lucene.analysis.Analyzer) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) Document(org.apache.lucene.document.Document) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) Directory(org.apache.lucene.store.Directory) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Aggregations

Analyzer (org.apache.lucene.analysis.Analyzer)4 MockTokenFilter (org.apache.lucene.analysis.MockTokenFilter)4 MockTokenizer (org.apache.lucene.analysis.MockTokenizer)4 Document (org.apache.lucene.document.Document)4 Directory (org.apache.lucene.store.Directory)4 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)3 CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream)2 TokenStream (org.apache.lucene.analysis.TokenStream)2 Tokenizer (org.apache.lucene.analysis.Tokenizer)2 TextField (org.apache.lucene.document.TextField)2 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)2 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)2 IndexSearcher (org.apache.lucene.search.IndexSearcher)2 PhraseQuery (org.apache.lucene.search.PhraseQuery)2 FSDirectory (org.apache.lucene.store.FSDirectory)2 MMapDirectory (org.apache.lucene.store.MMapDirectory)2 NIOFSDirectory (org.apache.lucene.store.NIOFSDirectory)2 RAMDirectory (org.apache.lucene.store.RAMDirectory)2 SimpleFSDirectory (org.apache.lucene.store.SimpleFSDirectory)2 Automaton (org.apache.lucene.util.automaton.Automaton)2