Search in sources :

Example 46 with Token

use of org.apache.lucene.analysis.Token in project lucene-solr by apache.

the class TestTrimFilter method testTrim.

/**
 * Feeds five canned tokens (padded, trailing-space, unpadded, all-whitespace and empty terms)
 * through {@code TrimFilter} and checks the resulting term texts.
 */
public void testTrim() throws Exception {
    final char[] padded = " a ".toCharArray();
    final char[] trailing = "b   ".toCharArray();
    final char[] unpadded = "cCc".toCharArray();
    final char[] blanks = "   ".toCharArray();
    final char[] nothing = "".toCharArray();

    // One token per input term; the offsets are the ones the test has always supplied.
    final Token[] input = {
        new Token(String.valueOf(padded), 1, 5),
        new Token(String.valueOf(trailing), 6, 10),
        new Token(String.valueOf(unpadded), 11, 15),
        new Token(String.valueOf(blanks), 16, 20),
        new Token(String.valueOf(nothing), 21, 21)
    };
    final String[] expectedTerms = { "a", "b", "cCc", "", "" };

    TokenStream trimmed = new TrimFilter(new CannedTokenStream(input));
    assertTokenStreamContents(trimmed, expectedTerms);
}
Also used : CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) TokenStream(org.apache.lucene.analysis.TokenStream) Token(org.apache.lucene.analysis.Token)

Example 47 with Token

use of org.apache.lucene.analysis.Token in project lucene-solr by apache.

the class TestRemoveDuplicatesTokenFilter method tok.

/**
 * Creates a {@link Token} for term {@code t} spanning offsets {@code start}..{@code end}
 * and sets its position increment to {@code pos}.
 *
 * @param pos   position increment to set on the token
 * @param t     term text
 * @param start start offset passed to the {@link Token} constructor
 * @param end   end offset passed to the {@link Token} constructor
 * @return the newly created token
 */
public static Token tok(int pos, String t, int start, int end) {
    final Token token = new Token(t, start, end);
    token.setPositionIncrement(pos);
    return token;
}
Also used : Token(org.apache.lucene.analysis.Token)

Example 48 with Token

use of org.apache.lucene.analysis.Token in project lucene-solr by apache.

the class TestRemoveDuplicatesTokenFilterFactory method tok.

/**
 * Creates a {@link Token} for term {@code t} spanning offsets {@code start}..{@code end}
 * and sets its position increment to {@code pos}.
 *
 * @param pos   position increment to set on the token
 * @param t     term text
 * @param start start offset passed to the {@link Token} constructor
 * @param end   end offset passed to the {@link Token} constructor
 * @return the newly created token
 */
public static Token tok(int pos, String t, int start, int end) {
    final Token token = new Token(t, start, end);
    token.setPositionIncrement(pos);
    return token;
}
Also used : Token(org.apache.lucene.analysis.Token)

Example 49 with Token

use of org.apache.lucene.analysis.Token in project lucene-solr by apache.

the class TestPayloadsOnVectors method testMixupDocs.

/**
 * Some docs have a payload attribute, some do not.
 *
 * <p>Indexes the same {@code Document}/{@code Field} instance three times with different token
 * streams: a tokenizer over "here we go", a canned token carrying the payload "test", and a
 * tokenizer over "another". It then reads the term vector of the second document (doc id 1) and
 * checks that the payload is returned.
 */
public void testMixupDocs() throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
    iwc.setMergePolicy(newLogMergePolicy());
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
    Document doc = new Document();

    // Term vectors with positions and payloads are always on; offsets are randomized.
    FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
    customType.setStoreTermVectors(true);
    customType.setStoreTermVectorPositions(true);
    customType.setStoreTermVectorPayloads(true);
    customType.setStoreTermVectorOffsets(random().nextBoolean());
    Field field = new Field("field", "", customType);

    // First document: tokenizer-produced stream.
    TokenStream ts = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    ((Tokenizer) ts).setReader(new StringReader("here we go"));
    field.setTokenStream(ts);
    doc.add(field);
    writer.addDocument(doc);

    // Second document: a single canned token that carries a payload.
    Token withPayload = new Token("withPayload", 0, 11);
    withPayload.setPayload(new BytesRef("test"));
    ts = new CannedTokenStream(withPayload);
    assertTrue(ts.hasAttribute(PayloadAttribute.class));
    field.setTokenStream(ts);
    writer.addDocument(doc);

    // Third document: tokenizer-produced stream again.
    ts = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    ((Tokenizer) ts).setReader(new StringReader("another"));
    field.setTokenStream(ts);
    writer.addDocument(doc);

    DirectoryReader reader = writer.getReader();
    Terms terms = reader.getTermVector(1, "field");
    // Use a test assertion rather than the 'assert' keyword so the check still runs
    // (and fails the test cleanly) when the JVM is started without -ea.
    assertTrue("expected term vectors for doc 1 in field 'field'", terms != null);
    TermsEnum termsEnum = terms.iterator();
    assertTrue(termsEnum.seekExact(new BytesRef("withPayload")));
    PostingsEnum de = termsEnum.postings(null, PostingsEnum.ALL);
    assertEquals(0, de.nextDoc());
    assertEquals(0, de.nextPosition());
    assertEquals(new BytesRef("test"), de.getPayload());
    writer.close();
    reader.close();
    dir.close();
}
Also used : CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) TokenStream(org.apache.lucene.analysis.TokenStream) PayloadAttribute(org.apache.lucene.analysis.tokenattributes.PayloadAttribute) Token(org.apache.lucene.analysis.Token) Document(org.apache.lucene.document.Document) FieldType(org.apache.lucene.document.FieldType) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) StringReader(java.io.StringReader) Tokenizer(org.apache.lucene.analysis.Tokenizer) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory)

Example 50 with Token

use of org.apache.lucene.analysis.Token in project lucene-solr by apache.

the class TestMaxPosition method testTooBigPosition.

/**
 * Adds a document whose second token has a position increment of
 * {@code IndexWriter.MAX_POSITION}, expects {@code addDocument} to throw
 * {@link IllegalArgumentException}, and then checks that a reader opened on the writer
 * reports zero documents.
 */
public void testTooBigPosition() throws Exception {
    Directory directory = newDirectory();
    IndexWriter indexWriter = new IndexWriter(directory, newIndexWriterConfig(null));
    Document document = new Document();

    // First token; per the original test note, this is at position 1:
    Token first = new Token("foo", 0, 3);
    first.setPositionIncrement(2);
    if (random().nextBoolean()) {
        // Randomly attach a one-byte payload.
        first.setPayload(new BytesRef(new byte[] { 0x1 }));
    }

    // Second token; this increment should overflow the max position:
    Token second = new Token("foo", 4, 7);
    second.setPositionIncrement(IndexWriter.MAX_POSITION);
    if (random().nextBoolean()) {
        second.setPayload(new BytesRef(new byte[] { 0x1 }));
    }

    document.add(new TextField("foo", new CannedTokenStream(first, second)));
    expectThrows(IllegalArgumentException.class, () -> {
        indexWriter.addDocument(document);
    });

    // The rejected document should not be visible:
    IndexReader reader = DirectoryReader.open(indexWriter);
    assertEquals(0, reader.numDocs());
    reader.close();
    indexWriter.close();
    directory.close();
}
Also used : TextField(org.apache.lucene.document.TextField) Token(org.apache.lucene.analysis.Token) Document(org.apache.lucene.document.Document) CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory)

Aggregations

Token (org.apache.lucene.analysis.Token)100 CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream)39 TokenStream (org.apache.lucene.analysis.TokenStream)31 Directory (org.apache.lucene.store.Directory)24 Test (org.junit.Test)23 Document (org.apache.lucene.document.Document)19 TextField (org.apache.lucene.document.TextField)19 BytesRef (org.apache.lucene.util.BytesRef)16 NamedList (org.apache.solr.common.util.NamedList)16 StringReader (java.io.StringReader)15 CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)15 Analyzer (org.apache.lucene.analysis.Analyzer)14 ArrayList (java.util.ArrayList)13 Map (java.util.Map)13 Field (org.apache.lucene.document.Field)13 FieldType (org.apache.lucene.document.FieldType)11 IndexReader (org.apache.lucene.index.IndexReader)11 MockTokenizer (org.apache.lucene.analysis.MockTokenizer)10 Tokenizer (org.apache.lucene.analysis.Tokenizer)9 Date (java.util.Date)8