Search in sources :

Example 61 with CannedTokenStream

use of org.apache.lucene.analysis.CannedTokenStream in project lucene-solr by apache.

the class TestRemoveDuplicatesTokenFilterFactory method testDups.

/**
 * Feeds the given tokens through the "RemoveDuplicates" token filter and verifies the
 * terms that come out.
 *
 * @param expected the expected output terms, split on single whitespace characters
 * @param tokens   the canned input tokens handed to the filter
 */
public void testDups(final String expected, final Token... tokens) throws Exception {
    final TokenStream canned = new CannedTokenStream(tokens);
    final TokenStream deduplicated = tokenFilterFactory("RemoveDuplicates").create(canned);
    final String[] expectedTerms = expected.split("\\s");
    assertTokenStreamContents(deduplicated, expectedTerms);
}
Also used : CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) TokenStream(org.apache.lucene.analysis.TokenStream)

Example 62 with CannedTokenStream

use of org.apache.lucene.analysis.CannedTokenStream in project lucene-solr by apache.

the class TestFixBrokenOffsetsFilter method testBogusTermVectors.

/**
 * Indexes one document whose canned tokens carry out-of-order offsets ("bar" at 5-10
 * followed by "bar" at 1-4), wrapped in a {@link FixBrokenOffsetsFilter}, into a field
 * that stores term vectors with offsets.
 *
 * <p>The test has no explicit assertions: it succeeds when adding the document and
 * closing the writer and directory complete without throwing.
 *
 * @throws IOException if indexing or closing a resource fails
 */
public void testBogusTermVectors() throws IOException {
    // try-with-resources closes the writer first and then the directory (reverse
    // declaration order), and does so even when indexing throws.
    try (Directory dir = newDirectory();
         IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null))) {
        FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
        ft.setStoreTermVectors(true);
        ft.setStoreTermVectorOffsets(true);

        // The string value is a placeholder; the token stream set below supplies the tokens.
        Field field = new Field("foo", "", ft);
        field.setTokenStream(new FixBrokenOffsetsFilter(new CannedTokenStream(new Token("bar", 5, 10), new Token("bar", 1, 4))));

        Document doc = new Document();
        doc.add(field);
        iw.addDocument(doc);
    }
}
Also used : Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) IndexWriter(org.apache.lucene.index.IndexWriter) Token(org.apache.lucene.analysis.Token) Document(org.apache.lucene.document.Document) CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) Directory(org.apache.lucene.store.Directory) FieldType(org.apache.lucene.document.FieldType)

Example 63 with CannedTokenStream

use of org.apache.lucene.analysis.CannedTokenStream in project lucene-solr by apache.

the class TestPostingsOffsets method testBasic.

/**
 * Indexes a single document built from four canned tokens ("a", "b", "a", "c") with
 * offsets enabled in the postings, then checks doc id, frequency, positions and
 * start/end offsets reported for each of the three terms.
 */
public void testBasic() throws Exception {
    Directory directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory, iwc);
    Document document = new Document();

    FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
    fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    // Randomly enable term vectors, with randomly chosen position/offset storage.
    if (random().nextBoolean()) {
        fieldType.setStoreTermVectors(true);
        fieldType.setStoreTermVectorPositions(random().nextBoolean());
        fieldType.setStoreTermVectorOffsets(random().nextBoolean());
    }

    // NOTE(review): makeToken arguments look like (text, position increment, start offset,
    // end offset), judging by the assertions below -- confirm against the helper.
    Token[] cannedTokens = new Token[] {
        makeToken("a", 1, 0, 6),
        makeToken("b", 1, 8, 9),
        makeToken("a", 1, 9, 17),
        makeToken("c", 1, 19, 50)
    };
    document.add(new Field("content", new CannedTokenStream(cannedTokens), fieldType));
    writer.addDocument(document);

    IndexReader reader = writer.getReader();
    writer.close();

    // Term "a": two occurrences, at positions 0 and 2.
    PostingsEnum postings = MultiFields.getTermPositionsEnum(reader, "content", new BytesRef("a"));
    assertNotNull(postings);
    assertEquals(0, postings.nextDoc());
    assertEquals(2, postings.freq());
    assertEquals(0, postings.nextPosition());
    assertEquals(0, postings.startOffset());
    assertEquals(6, postings.endOffset());
    assertEquals(2, postings.nextPosition());
    assertEquals(9, postings.startOffset());
    assertEquals(17, postings.endOffset());
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, postings.nextDoc());

    // Term "b": one occurrence, at position 1.
    postings = MultiFields.getTermPositionsEnum(reader, "content", new BytesRef("b"));
    assertNotNull(postings);
    assertEquals(0, postings.nextDoc());
    assertEquals(1, postings.freq());
    assertEquals(1, postings.nextPosition());
    assertEquals(8, postings.startOffset());
    assertEquals(9, postings.endOffset());
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, postings.nextDoc());

    // Term "c": one occurrence, at position 3.
    postings = MultiFields.getTermPositionsEnum(reader, "content", new BytesRef("c"));
    assertNotNull(postings);
    assertEquals(0, postings.nextDoc());
    assertEquals(1, postings.freq());
    assertEquals(3, postings.nextPosition());
    assertEquals(19, postings.startOffset());
    assertEquals(50, postings.endOffset());
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, postings.nextDoc());

    reader.close();
    directory.close();
}
Also used : StringField(org.apache.lucene.document.StringField) NumericDocValuesField(org.apache.lucene.document.NumericDocValuesField) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) Token(org.apache.lucene.analysis.Token) Document(org.apache.lucene.document.Document) CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory) FieldType(org.apache.lucene.document.FieldType)

Example 64 with CannedTokenStream

use of org.apache.lucene.analysis.CannedTokenStream in project lucene-solr by apache.

the class TestPostingsOffsets method testLegalbutVeryLargeOffsets.

/**
 * Indexes two "foo" tokens whose offsets run up to {@link Integer#MAX_VALUE}
 * (0 to MAX_VALUE-500, then MAX_VALUE-500 to MAX_VALUE), with offsets in the postings
 * and full term vectors enabled. The first token randomly carries a payload.
 *
 * <p>The test has no explicit assertions: it succeeds when adding the document and
 * closing the writer and directory complete without throwing.
 */
public void testLegalbutVeryLargeOffsets() throws Exception {
    // try-with-resources closes the writer first and then the directory (reverse
    // declaration order), and does so even when indexing throws.
    try (Directory dir = newDirectory();
         IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null))) {
        Document doc = new Document();
        Token t1 = new Token("foo", 0, Integer.MAX_VALUE - 500);
        if (random().nextBoolean()) {
            t1.setPayload(new BytesRef("test"));
        }
        Token t2 = new Token("foo", Integer.MAX_VALUE - 500, Integer.MAX_VALUE);
        TokenStream tokenStream = new CannedTokenStream(new Token[] { t1, t2 });

        FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
        ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        // store some term vectors for the checkindex cross-check
        ft.setStoreTermVectors(true);
        ft.setStoreTermVectorPositions(true);
        ft.setStoreTermVectorOffsets(true);

        Field field = new Field("foo", tokenStream, ft);
        doc.add(field);
        iw.addDocument(doc);
    }
}
Also used : StringField(org.apache.lucene.document.StringField) NumericDocValuesField(org.apache.lucene.document.NumericDocValuesField) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) TokenStream(org.apache.lucene.analysis.TokenStream) Token(org.apache.lucene.analysis.Token) Document(org.apache.lucene.document.Document) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory) FieldType(org.apache.lucene.document.FieldType)

Example 65 with CannedTokenStream

use of org.apache.lucene.analysis.CannedTokenStream in project lucene-solr by apache.

the class TestPayloadsOnVectors method testMixupMultiValued.

/**
 * Some field instances have a payload attribute, some do not.
 *
 * <p>Adds three instances of the same field to one document: a tokenizer over
 * "here we go", a canned token "withPayload" carrying the payload "test", and a
 * tokenizer over "nopayload". It then reads the document's term vector and checks that
 * "withPayload" is found at position 3 with its payload intact.
 */
public void testMixupMultiValued() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
    Document doc = new Document();
    FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
    customType.setStoreTermVectors(true);
    customType.setStoreTermVectorPositions(true);
    customType.setStoreTermVectorPayloads(true);
    customType.setStoreTermVectorOffsets(random().nextBoolean());

    // First instance: tokenizer over "here we go".
    Field field = new Field("field", "", customType);
    Tokenizer firstTokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    firstTokenizer.setReader(new StringReader("here we go"));
    field.setTokenStream(firstTokenizer);
    doc.add(field);

    // Second instance: a single canned token that carries a payload.
    Field field2 = new Field("field", "", customType);
    Token withPayload = new Token("withPayload", 0, 11);
    withPayload.setPayload(new BytesRef("test"));
    TokenStream payloadStream = new CannedTokenStream(withPayload);
    assertTrue(payloadStream.hasAttribute(PayloadAttribute.class));
    field2.setTokenStream(payloadStream);
    doc.add(field2);

    // Third instance: tokenizer over "nopayload".
    Field field3 = new Field("field", "", customType);
    Tokenizer lastTokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    lastTokenizer.setReader(new StringReader("nopayload"));
    field3.setTokenStream(lastTokenizer);
    doc.add(field3);

    writer.addDocument(doc);
    DirectoryReader reader = writer.getReader();
    Terms terms = reader.getTermVector(0, "field");
    // Use a test assertion rather than the `assert` keyword so the check runs
    // regardless of whether JVM assertions are enabled.
    assertNotNull(terms);
    TermsEnum termsEnum = terms.iterator();
    assertTrue(termsEnum.seekExact(new BytesRef("withPayload")));
    PostingsEnum de = termsEnum.postings(null, PostingsEnum.ALL);
    assertEquals(0, de.nextDoc());
    assertEquals(3, de.nextPosition());
    assertEquals(new BytesRef("test"), de.getPayload());
    writer.close();
    reader.close();
    dir.close();
}
Also used : CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) TokenStream(org.apache.lucene.analysis.TokenStream) PayloadAttribute(org.apache.lucene.analysis.tokenattributes.PayloadAttribute) Token(org.apache.lucene.analysis.Token) Document(org.apache.lucene.document.Document) FieldType(org.apache.lucene.document.FieldType) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) StringReader(java.io.StringReader) Tokenizer(org.apache.lucene.analysis.Tokenizer) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory)

Aggregations

CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream) 78, TokenStream (org.apache.lucene.analysis.TokenStream) 43, Token (org.apache.lucene.analysis.Token) 37, Directory (org.apache.lucene.store.Directory) 33, Document (org.apache.lucene.document.Document) 26, TextField (org.apache.lucene.document.TextField) 22, Field (org.apache.lucene.document.Field) 15, MockTokenizer (org.apache.lucene.analysis.MockTokenizer) 14, BytesRef (org.apache.lucene.util.BytesRef) 14, FieldType (org.apache.lucene.document.FieldType) 13, Term (org.apache.lucene.index.Term) 13, MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer) 11, Tokenizer (org.apache.lucene.analysis.Tokenizer) 11, IndexReader (org.apache.lucene.index.IndexReader) 10, Analyzer (org.apache.lucene.analysis.Analyzer) 9, StringField (org.apache.lucene.document.StringField) 8, RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter) 8, Reader (java.io.Reader) 7, StringReader (java.io.StringReader) 7, Input (org.apache.lucene.search.suggest.Input) 7