
Example 91 with MockTokenizer

Use of org.apache.lucene.analysis.MockTokenizer in project lucene-solr by apache: class TestSynonymMapFilter, method testRandom.

public void testRandom() throws Exception {
    final int alphabetSize = TestUtil.nextInt(random(), 2, 7);
    final int docLen = atLeast(3000);
    //final int docLen = 50;
    final String document = getRandomString('a', alphabetSize, docLen);
    if (VERBOSE) {
        System.out.println("TEST: doc=" + document);
    }
    final int numSyn = atLeast(5);
    //final int numSyn = 2;
    final Map<String, OneSyn> synMap = new HashMap<>();
    final List<OneSyn> syns = new ArrayList<>();
    final boolean dedup = random().nextBoolean();
    if (VERBOSE) {
        System.out.println("  dedup=" + dedup);
    }
    b = new SynonymMap.Builder(dedup);
    for (int synIDX = 0; synIDX < numSyn; synIDX++) {
        final String synIn = getRandomString('a', alphabetSize, TestUtil.nextInt(random(), 1, 5)).trim();
        OneSyn s = synMap.get(synIn);
        if (s == null) {
            s = new OneSyn();
            s.in = synIn;
            syns.add(s);
            s.out = new ArrayList<>();
            synMap.put(synIn, s);
            s.keepOrig = random().nextBoolean();
        }
        final String synOut = getRandomString('0', 10, TestUtil.nextInt(random(), 1, 5)).trim();
        s.out.add(synOut);
        add(synIn, synOut, s.keepOrig);
        if (VERBOSE) {
            System.out.println("  syns[" + synIDX + "] = " + s.in + " -> " + s.out + " keepOrig=" + s.keepOrig);
        }
    }
    tokensIn = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    tokensIn.setReader(new StringReader("a"));
    tokensIn.reset();
    assertTrue(tokensIn.incrementToken());
    assertFalse(tokensIn.incrementToken());
    tokensIn.end();
    tokensIn.close();
    tokensOut = new SynonymFilter(tokensIn, b.build(), true);
    termAtt = tokensOut.addAttribute(CharTermAttribute.class);
    posIncrAtt = tokensOut.addAttribute(PositionIncrementAttribute.class);
    posLenAtt = tokensOut.addAttribute(PositionLengthAttribute.class);
    offsetAtt = tokensOut.addAttribute(OffsetAttribute.class);
    if (dedup) {
        pruneDups(syns);
    }
    final String expected = slowSynMatcher(document, syns, 5);
    if (VERBOSE) {
        System.out.println("TEST: expected=" + expected);
    }
    verify(document, expected);
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) StringReader(java.io.StringReader)
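The priming of tokensIn above follows Lucene's standard TokenStream consumer contract: setReader, reset(), an incrementToken() loop, then end() and close(). A minimal, self-contained sketch of that lifecycle, using only the MockTokenizer and CharTermAttribute classes already used in the test above (the method name is hypothetical):

public void demoConsumerContract() throws Exception {
    MockTokenizer tok = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    tok.setReader(new StringReader("a b c"));
    CharTermAttribute term = tok.addAttribute(CharTermAttribute.class);
    // reset() must be called before the first incrementToken()
    tok.reset();
    while (tok.incrementToken()) {
        // prints "a", "b", "c"
        System.out.println(term.toString());
    }
    // end() finalizes end-of-stream state; close() releases the reader
    tok.end();
    tok.close();
}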

Example 92 with MockTokenizer

Use of org.apache.lucene.analysis.MockTokenizer in project lucene-solr by apache: class TestDocument, method testInvalidFields.

// LUCENE-3616
public void testInvalidFields() {
    expectThrows(IllegalArgumentException.class, () -> {
        Tokenizer tok = new MockTokenizer();
        tok.setReader(new StringReader(""));
        new Field("foo", tok, StringField.TYPE_STORED);
    });
}
Also used : MockTokenizer(org.apache.lucene.analysis.MockTokenizer) IndexableField(org.apache.lucene.index.IndexableField) StringReader(java.io.StringReader) Tokenizer(org.apache.lucene.analysis.Tokenizer)
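The IllegalArgumentException here is expected: StringField.TYPE_STORED describes a stored, non-tokenized field, and a TokenStream carries no plain string value that could be stored. An indexed, tokenized field type does accept a TokenStream value. A minimal sketch of the legal counterpart, added for illustration and not part of the original test:

    Tokenizer tok = new MockTokenizer();
    tok.setReader(new StringReader("some text"));
    // TextField.TYPE_NOT_STORED is indexed and tokenized, so a TokenStream value is accepted
    Field ok = new Field("foo", tok, TextField.TYPE_NOT_STORED);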

Example 93 with MockTokenizer

Use of org.apache.lucene.analysis.MockTokenizer in project lucene-solr by apache: class TestIndexWriter, method testStopwordsPosIncHole.

// LUCENE-3849
public void testStopwordsPosIncHole() throws Exception {
    Directory dir = newDirectory();
    Analyzer a = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer();
            TokenStream stream = new MockTokenFilter(tokenizer, MockTokenFilter.ENGLISH_STOPSET);
            return new TokenStreamComponents(tokenizer, stream);
        }
    };
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, a);
    Document doc = new Document();
    doc.add(new TextField("body", "just a", Field.Store.NO));
    doc.add(new TextField("body", "test of gaps", Field.Store.NO));
    iw.addDocument(doc);
    IndexReader ir = iw.getReader();
    iw.close();
    IndexSearcher is = newSearcher(ir);
    PhraseQuery.Builder builder = new PhraseQuery.Builder();
    builder.add(new Term("body", "just"), 0);
    builder.add(new Term("body", "test"), 2);
    PhraseQuery pq = builder.build();
    // body:"just ? test"
    assertEquals(1, is.search(pq, 5).totalHits);
    ir.close();
    dir.close();
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) TokenStream(org.apache.lucene.analysis.TokenStream) MockTokenFilter(org.apache.lucene.analysis.MockTokenFilter) PhraseQuery(org.apache.lucene.search.PhraseQuery) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer) Document(org.apache.lucene.document.Document) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) TextField(org.apache.lucene.document.TextField) Tokenizer(org.apache.lucene.analysis.Tokenizer) MMapDirectory(org.apache.lucene.store.MMapDirectory) Directory(org.apache.lucene.store.Directory) RAMDirectory(org.apache.lucene.store.RAMDirectory) FSDirectory(org.apache.lucene.store.FSDirectory) SimpleFSDirectory(org.apache.lucene.store.SimpleFSDirectory) NIOFSDirectory(org.apache.lucene.store.NIOFSDirectory)
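The phrase matches because MockTokenFilter removes the stopword "a" but preserves its position: "just" is indexed at position 0 and "test" at position 2, exactly the positions given to the PhraseQuery builder. A sketch showing how that hole appears in the token stream, a hypothetical demo using the same classes plus PositionIncrementAttribute:

public void demoPosIncHole() throws Exception {
    Tokenizer t = new MockTokenizer();
    t.setReader(new StringReader("just a test"));
    TokenStream s = new MockTokenFilter(t, MockTokenFilter.ENGLISH_STOPSET);
    CharTermAttribute term = s.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = s.addAttribute(PositionIncrementAttribute.class);
    s.reset();
    while (s.incrementToken()) {
        // expected: "just" with increment 1, then "test" with increment 2 (the hole left by "a")
        System.out.println(term + " +" + posIncr.getPositionIncrement());
    }
    s.end();
    s.close();
}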

Example 94 with MockTokenizer

Use of org.apache.lucene.analysis.MockTokenizer in project lucene-solr by apache: class TestIndexWriter, method testIndexStoreCombos.

public void testIndexStoreCombos() throws Exception {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
    byte[] b = new byte[50];
    for (int i = 0; i < 50; i++) b[i] = (byte) (i + 77);
    Document doc = new Document();
    FieldType customType = new FieldType(StoredField.TYPE);
    customType.setTokenized(true);
    Field f = new Field("binary", b, 10, 17, customType);
    // TODO: this is evil, changing the type after creating the field:
    customType.setIndexOptions(IndexOptions.DOCS);
    final MockTokenizer doc1field1 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    doc1field1.setReader(new StringReader("doc1field1"));
    f.setTokenStream(doc1field1);
    FieldType customType2 = new FieldType(TextField.TYPE_STORED);
    Field f2 = newField("string", "value", customType2);
    final MockTokenizer doc1field2 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    doc1field2.setReader(new StringReader("doc1field2"));
    f2.setTokenStream(doc1field2);
    doc.add(f);
    doc.add(f2);
    w.addDocument(doc);
    // add 2 docs to test in-memory merging
    final MockTokenizer doc2field1 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    doc2field1.setReader(new StringReader("doc2field1"));
    f.setTokenStream(doc2field1);
    final MockTokenizer doc2field2 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    doc2field2.setReader(new StringReader("doc2field2"));
    f2.setTokenStream(doc2field2);
    w.addDocument(doc);
    // force segment flush so we can force a segment merge with doc3 later.
    w.commit();
    final MockTokenizer doc3field1 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    doc3field1.setReader(new StringReader("doc3field1"));
    f.setTokenStream(doc3field1);
    final MockTokenizer doc3field2 = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    doc3field2.setReader(new StringReader("doc3field2"));
    f2.setTokenStream(doc3field2);
    w.addDocument(doc);
    w.commit();
    // force segment merge.
    w.forceMerge(1);
    w.close();
    IndexReader ir = DirectoryReader.open(dir);
    Document doc2 = ir.document(0);
    IndexableField f3 = doc2.getField("binary");
    b = f3.binaryValue().bytes;
    assertNotNull(b);
    assertEquals(17, b.length);
    assertEquals(87, b[0]);
    assertNotNull(ir.document(0).getField("binary").binaryValue());
    assertNotNull(ir.document(1).getField("binary").binaryValue());
    assertNotNull(ir.document(2).getField("binary").binaryValue());
    assertEquals("value", ir.document(0).get("string"));
    assertEquals("value", ir.document(1).get("string"));
    assertEquals("value", ir.document(2).get("string"));
    // test that the terms were indexed.
    assertTrue(TestUtil.docs(random(), ir, "binary", new BytesRef("doc1field1"), null, PostingsEnum.NONE).nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    assertTrue(TestUtil.docs(random(), ir, "binary", new BytesRef("doc2field1"), null, PostingsEnum.NONE).nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    assertTrue(TestUtil.docs(random(), ir, "binary", new BytesRef("doc3field1"), null, PostingsEnum.NONE).nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    assertTrue(TestUtil.docs(random(), ir, "string", new BytesRef("doc1field2"), null, PostingsEnum.NONE).nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    assertTrue(TestUtil.docs(random(), ir, "string", new BytesRef("doc2field2"), null, PostingsEnum.NONE).nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    assertTrue(TestUtil.docs(random(), ir, "string", new BytesRef("doc3field2"), null, PostingsEnum.NONE).nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    ir.close();
    dir.close();
}
Also used : Document(org.apache.lucene.document.Document) FieldType(org.apache.lucene.document.FieldType) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) SortedNumericDocValuesField(org.apache.lucene.document.SortedNumericDocValuesField) StoredField(org.apache.lucene.document.StoredField) NumericDocValuesField(org.apache.lucene.document.NumericDocValuesField) SortedSetDocValuesField(org.apache.lucene.document.SortedSetDocValuesField) BinaryDocValuesField(org.apache.lucene.document.BinaryDocValuesField) SortedDocValuesField(org.apache.lucene.document.SortedDocValuesField) StringField(org.apache.lucene.document.StringField) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) StringReader(java.io.StringReader) BytesRef(org.apache.lucene.util.BytesRef) MMapDirectory(org.apache.lucene.store.MMapDirectory) Directory(org.apache.lucene.store.Directory) RAMDirectory(org.apache.lucene.store.RAMDirectory) FSDirectory(org.apache.lucene.store.FSDirectory) SimpleFSDirectory(org.apache.lucene.store.SimpleFSDirectory) NIOFSDirectory(org.apache.lucene.store.NIOFSDirectory)
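The pattern to note above is the pre-tokenized field: Field.setTokenStream decouples the indexed terms from the stored value, which is how the stored binary payload becomes searchable under terms such as "doc1field1". A minimal sketch of the pattern, with hypothetical field names and values:

    FieldType ft = new FieldType(TextField.TYPE_STORED);
    Field f = new Field("body", "stored value", ft);
    MockTokenizer ts = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    ts.setReader(new StringReader("indexed tokens"));
    // the inverted index receives "indexed" and "tokens"; the stored document keeps "stored value"
    f.setTokenStream(ts);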

Example 95 with MockTokenizer

Use of org.apache.lucene.analysis.MockTokenizer in project lucene-solr by apache: class TestGermanStemFilter, method testKeyword.

public void testKeyword() throws IOException {
    final CharArraySet exclusionSet = new CharArraySet(asSet("sängerinnen"), false);
    Analyzer a = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
            return new TokenStreamComponents(source, new GermanStemFilter(sink));
        }
    };
    checkOneTerm(a, "sängerinnen", "sängerinnen");
    a.close();
}
Also used : MockTokenizer(org.apache.lucene.analysis.MockTokenizer) CharArraySet(org.apache.lucene.analysis.CharArraySet) TokenStream(org.apache.lucene.analysis.TokenStream) SetKeywordMarkerFilter(org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter) Analyzer(org.apache.lucene.analysis.Analyzer) Tokenizer(org.apache.lucene.analysis.Tokenizer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer)
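SetKeywordMarkerFilter works by setting KeywordAttribute on tokens found in the exclusion set, and keyword-aware stemmers such as GermanStemFilter skip marked tokens, which is why "sängerinnen" comes through unstemmed. A small sketch of the marking step in isolation, a hypothetical demo that additionally assumes java.util.Arrays and org.apache.lucene.analysis.tokenattributes.KeywordAttribute:

public void demoKeywordMarker() throws Exception {
    CharArraySet protect = new CharArraySet(Arrays.asList("sängerinnen"), false);
    Tokenizer src = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    src.setReader(new StringReader("sängerinnen die singen"));
    TokenStream ts = new SetKeywordMarkerFilter(src, protect);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    KeywordAttribute kw = ts.addAttribute(KeywordAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        // keyword=true only for "sängerinnen"; downstream stemmers leave keyword tokens untouched
        System.out.println(term + " keyword=" + kw.isKeyword());
    }
    ts.end();
    ts.close();
}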

Aggregations

MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 280 uses
Tokenizer (org.apache.lucene.analysis.Tokenizer): 204 uses
Analyzer (org.apache.lucene.analysis.Analyzer): 161 uses
StringReader (java.io.StringReader): 118 uses
TokenStream (org.apache.lucene.analysis.TokenStream): 116 uses
KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer): 106 uses
Reader (java.io.Reader): 59 uses
MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 54 uses
CharArraySet (org.apache.lucene.analysis.CharArraySet): 44 uses
Directory (org.apache.lucene.store.Directory): 36 uses
Document (org.apache.lucene.document.Document): 31 uses
BytesRef (org.apache.lucene.util.BytesRef): 25 uses
SetKeywordMarkerFilter (org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter): 21 uses
TextField (org.apache.lucene.document.TextField): 20 uses
CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream): 18 uses
Field (org.apache.lucene.document.Field): 17 uses
FieldType (org.apache.lucene.document.FieldType): 14 uses
StringField (org.apache.lucene.document.StringField): 11 uses
Input (org.apache.lucene.search.suggest.Input): 11 uses
InputArrayIterator (org.apache.lucene.search.suggest.InputArrayIterator): 11 uses