Example 91 with Token

Use of org.apache.lucene.analysis.Token in project lucene-solr by apache.

From the class TestPostingsOffsets, method testBasic:

public void testBasic() throws Exception {
    Directory dir = newDirectory();
    // iwc is a field initialized in the test's setUp(); inlined here so the snippet is self-contained
    IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
    RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
    Document doc = new Document();
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    if (random().nextBoolean()) {
        ft.setStoreTermVectors(true);
        ft.setStoreTermVectorPositions(random().nextBoolean());
        ft.setStoreTermVectorOffsets(random().nextBoolean());
    }
    Token[] tokens = new Token[] { makeToken("a", 1, 0, 6), makeToken("b", 1, 8, 9), makeToken("a", 1, 9, 17), makeToken("c", 1, 19, 50) };
    doc.add(new Field("content", new CannedTokenStream(tokens), ft));
    w.addDocument(doc);
    IndexReader r = w.getReader();
    w.close();
    PostingsEnum dp = MultiFields.getTermPositionsEnum(r, "content", new BytesRef("a"));
    assertNotNull(dp);
    assertEquals(0, dp.nextDoc());
    assertEquals(2, dp.freq());
    assertEquals(0, dp.nextPosition());
    assertEquals(0, dp.startOffset());
    assertEquals(6, dp.endOffset());
    assertEquals(2, dp.nextPosition());
    assertEquals(9, dp.startOffset());
    assertEquals(17, dp.endOffset());
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, dp.nextDoc());
    dp = MultiFields.getTermPositionsEnum(r, "content", new BytesRef("b"));
    assertNotNull(dp);
    assertEquals(0, dp.nextDoc());
    assertEquals(1, dp.freq());
    assertEquals(1, dp.nextPosition());
    assertEquals(8, dp.startOffset());
    assertEquals(9, dp.endOffset());
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, dp.nextDoc());
    dp = MultiFields.getTermPositionsEnum(r, "content", new BytesRef("c"));
    assertNotNull(dp);
    assertEquals(0, dp.nextDoc());
    assertEquals(1, dp.freq());
    assertEquals(3, dp.nextPosition());
    assertEquals(19, dp.startOffset());
    assertEquals(50, dp.endOffset());
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, dp.nextDoc());
    r.close();
    dir.close();
}
Also used: StringField (org.apache.lucene.document.StringField), NumericDocValuesField (org.apache.lucene.document.NumericDocValuesField), Field (org.apache.lucene.document.Field), TextField (org.apache.lucene.document.TextField), Token (org.apache.lucene.analysis.Token), Document (org.apache.lucene.document.Document), CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream), BytesRef (org.apache.lucene.util.BytesRef), Directory (org.apache.lucene.store.Directory), FieldType (org.apache.lucene.document.FieldType)
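The asserted positions follow from the position increments: every token above uses posIncr = 1, so "a", "b", "a", "c" land at positions 0, 1, 2, 3. A minimal sketch that replays the same canned stream and prints each term with its accumulated position and offsets; it assumes the standard attribute classes from org.apache.lucene.analysis.tokenattributes and is illustrative, not part of the original test:

TokenStream ts = new CannedTokenStream(tokens);
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
ts.reset();
// a position is the running sum of increments, starting from -1
int pos = -1;
while (ts.incrementToken()) {
    pos += posIncrAtt.getPositionIncrement();
    System.out.println(termAtt + " pos=" + pos + " offsets=[" + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + ")");
}
ts.end();
ts.close();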

Example 92 with Token

Use of org.apache.lucene.analysis.Token in project lucene-solr by apache.

From the class TestPostingsOffsets, method makeToken:

private Token makeToken(String text, int posIncr, int startOffset, int endOffset) {
    final Token t = new Token();
    t.append(text);
    t.setPositionIncrement(posIncr);
    t.setOffset(startOffset, endOffset);
    return t;
}
Also used: Token (org.apache.lucene.analysis.Token)
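The later examples use Token's (text, startOffset, endOffset) convenience constructor instead; since the default position increment is 1, the helper above could equally be written as the following sketch (illustrative, not from the original source):

private Token makeToken(String text, int posIncr, int startOffset, int endOffset) {
    final Token t = new Token(text, startOffset, endOffset);
    t.setPositionIncrement(posIncr);
    return t;
}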

Example 93 with Token

Use of org.apache.lucene.analysis.Token in project lucene-solr by apache.

From the class TestPostingsOffsets, method testLegalbutVeryLargeOffsets:

public void testLegalbutVeryLargeOffsets() throws Exception {
    Directory dir = newDirectory();
    // a null analyzer is fine here: the field below supplies its own TokenStream
    IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null));
    Document doc = new Document();
    Token t1 = new Token("foo", 0, Integer.MAX_VALUE - 500);
    if (random().nextBoolean()) {
        t1.setPayload(new BytesRef("test"));
    }
    Token t2 = new Token("foo", Integer.MAX_VALUE - 500, Integer.MAX_VALUE);
    TokenStream tokenStream = new CannedTokenStream(new Token[] { t1, t2 });
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    // store some term vectors for the checkindex cross-check
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorPositions(true);
    ft.setStoreTermVectorOffsets(true);
    Field field = new Field("foo", tokenStream, ft);
    doc.add(field);
    iw.addDocument(doc);
    iw.close();
    dir.close();
}
Also used: StringField (org.apache.lucene.document.StringField), NumericDocValuesField (org.apache.lucene.document.NumericDocValuesField), Field (org.apache.lucene.document.Field), TextField (org.apache.lucene.document.TextField), CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream), TokenStream (org.apache.lucene.analysis.TokenStream), Token (org.apache.lucene.analysis.Token), Document (org.apache.lucene.document.Document), BytesRef (org.apache.lucene.util.BytesRef), Directory (org.apache.lucene.store.Directory), FieldType (org.apache.lucene.document.FieldType)
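Offsets are validated when they are set on the token: startOffset must be non-negative and endOffset must not be smaller than startOffset, which is why values right up to Integer.MAX_VALUE are legal above. A sketch of the failure mode (the exact exception and message can vary by Lucene version, so treat this as an assumption):

Token bad = new Token();
bad.append("foo");
try {
    // endOffset < startOffset is rejected by the offset attribute itself
    bad.setOffset(10, 5);
    fail("expected IllegalArgumentException");
} catch (IllegalArgumentException expected) {
    // the invalid offsets never reach the IndexWriter
}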

Example 94 with Token

Use of org.apache.lucene.analysis.Token in project lucene-solr by apache.

From the class TestPayloadsOnVectors, method testMixupMultiValued:

/** Some field instances have a payload attribute, some do not. */
public void testMixupMultiValued() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
    Document doc = new Document();
    FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
    customType.setStoreTermVectors(true);
    customType.setStoreTermVectorPositions(true);
    customType.setStoreTermVectorPayloads(true);
    customType.setStoreTermVectorOffsets(random().nextBoolean());
    Field field = new Field("field", "", customType);
    TokenStream ts = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    ((Tokenizer) ts).setReader(new StringReader("here we go"));
    field.setTokenStream(ts);
    doc.add(field);
    Field field2 = new Field("field", "", customType);
    Token withPayload = new Token("withPayload", 0, 11);
    withPayload.setPayload(new BytesRef("test"));
    ts = new CannedTokenStream(withPayload);
    assertTrue(ts.hasAttribute(PayloadAttribute.class));
    field2.setTokenStream(ts);
    doc.add(field2);
    Field field3 = new Field("field", "", customType);
    ts = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    ((Tokenizer) ts).setReader(new StringReader("nopayload"));
    field3.setTokenStream(ts);
    doc.add(field3);
    writer.addDocument(doc);
    DirectoryReader reader = writer.getReader();
    Terms terms = reader.getTermVector(0, "field");
    assert terms != null;
    TermsEnum termsEnum = terms.iterator();
    assertTrue(termsEnum.seekExact(new BytesRef("withPayload")));
    PostingsEnum de = termsEnum.postings(null, PostingsEnum.ALL);
    assertEquals(0, de.nextDoc());
    assertEquals(3, de.nextPosition());
    assertEquals(new BytesRef("test"), de.getPayload());
    writer.close();
    reader.close();
    dir.close();
}
Also used: CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream), TokenStream (org.apache.lucene.analysis.TokenStream), PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute), Token (org.apache.lucene.analysis.Token), Document (org.apache.lucene.document.Document), FieldType (org.apache.lucene.document.FieldType), MockTokenizer (org.apache.lucene.analysis.MockTokenizer), Field (org.apache.lucene.document.Field), TextField (org.apache.lucene.document.TextField), StringReader (java.io.StringReader), Tokenizer (org.apache.lucene.analysis.Tokenizer), BytesRef (org.apache.lucene.util.BytesRef), Directory (org.apache.lucene.store.Directory)
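Tokens indexed without a payload attribute simply report a null payload. A hypothetical continuation of the test above, reusing the termsEnum and de variables already in scope and imagined as running before the close() calls (not part of the original source):

assertTrue(termsEnum.seekExact(new BytesRef("nopayload")));
de = termsEnum.postings(null, PostingsEnum.ALL);
assertEquals(0, de.nextDoc());
de.nextPosition();
// this token came from the MockTokenizer stream, which carries no payloads
assertNull(de.getPayload());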

Example 95 with Token

Use of org.apache.lucene.analysis.Token in project lucene-solr by apache.

From the class TestPayloads, method testMixupDocs:

/** Some docs have a payload attribute, some do not. */
public void testMixupDocs() throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = newIndexWriterConfig(null);
    iwc.setMergePolicy(newLogMergePolicy());
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
    Document doc = new Document();
    Field field = new TextField("field", "", Field.Store.NO);
    TokenStream ts = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    ((Tokenizer) ts).setReader(new StringReader("here we go"));
    field.setTokenStream(ts);
    doc.add(field);
    writer.addDocument(doc);
    Token withPayload = new Token("withPayload", 0, 11);
    withPayload.setPayload(new BytesRef("test"));
    ts = new CannedTokenStream(withPayload);
    assertTrue(ts.hasAttribute(PayloadAttribute.class));
    field.setTokenStream(ts);
    writer.addDocument(doc);
    ts = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    ((Tokenizer) ts).setReader(new StringReader("another"));
    field.setTokenStream(ts);
    writer.addDocument(doc);
    DirectoryReader reader = writer.getReader();
    TermsEnum te = MultiFields.getFields(reader).terms("field").iterator();
    assertTrue(te.seekExact(new BytesRef("withPayload")));
    PostingsEnum de = te.postings(null, PostingsEnum.PAYLOADS);
    // only the second added document (docid 1) contains "withPayload"
    de.nextDoc();
    de.nextPosition();
    assertEquals(new BytesRef("test"), de.getPayload());
    writer.close();
    reader.close();
    dir.close();
}
Also used: CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream), TokenStream (org.apache.lucene.analysis.TokenStream), PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute), Token (org.apache.lucene.analysis.Token), Document (org.apache.lucene.document.Document), MockTokenizer (org.apache.lucene.analysis.MockTokenizer), Field (org.apache.lucene.document.Field), TextField (org.apache.lucene.document.TextField), StringReader (java.io.StringReader), Tokenizer (org.apache.lucene.analysis.Tokenizer), BytesRef (org.apache.lucene.util.BytesRef), Directory (org.apache.lucene.store.Directory)
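MultiFields presents a merged view over the reader's segments, so the same lookup can also go through MultiFields.getTerms directly. A small illustrative alternative, again assuming it runs before the close() calls:

Terms fieldTerms = MultiFields.getTerms(reader, "field");
TermsEnum te2 = fieldTerms.iterator();
assertTrue(te2.seekExact(new BytesRef("withPayload")));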

Aggregations

Token (org.apache.lucene.analysis.Token): 100
CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream): 39
TokenStream (org.apache.lucene.analysis.TokenStream): 31
Directory (org.apache.lucene.store.Directory): 24
Test (org.junit.Test): 23
Document (org.apache.lucene.document.Document): 19
TextField (org.apache.lucene.document.TextField): 19
BytesRef (org.apache.lucene.util.BytesRef): 16
NamedList (org.apache.solr.common.util.NamedList): 16
StringReader (java.io.StringReader): 15
CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 15
Analyzer (org.apache.lucene.analysis.Analyzer): 14
ArrayList (java.util.ArrayList): 13
Map (java.util.Map): 13
Field (org.apache.lucene.document.Field): 13
FieldType (org.apache.lucene.document.FieldType): 11
IndexReader (org.apache.lucene.index.IndexReader): 11
MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 10
Tokenizer (org.apache.lucene.analysis.Tokenizer): 9
Date (java.util.Date): 8