Search in sources :

Example 16 with PayloadAttribute

use of org.apache.lucene.analysis.tokenattributes.PayloadAttribute in project lucene-solr by apache.

Source: the SimpleQueryConverter class, method convert.

@Override
public Collection<Token> convert(String origQuery) {
    // Tokenizes the raw query with a whitespace analyzer and materializes each
    // stream position as a standalone Token carrying term, offsets, flags,
    // payload, position increment, and type.
    Collection<Token> result = new HashSet<>();
    // Analyzer is Closeable just like TokenStream: manage both in the same
    // try-with-resources so the analyzer is not leaked if tokenStream() or
    // iteration throws (the original closed only the stream).
    try (WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
         TokenStream ts = analyzer.tokenStream("", origQuery)) {
        // TODO: support custom attributes
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
        FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class);
        PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            // Snapshot every attribute of the current position into a new Token.
            Token tok = new Token();
            tok.copyBuffer(termAtt.buffer(), 0, termAtt.length());
            tok.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            tok.setFlags(flagsAtt.getFlags());
            tok.setPayload(payloadAtt.getPayload());
            tok.setPositionIncrement(posIncAtt.getPositionIncrement());
            tok.setType(typeAtt.type());
            result.add(tok);
        }
        ts.end();
        return result;
    } catch (IOException e) {
        // Tokenizing an in-memory string should never fail; surface as unchecked.
        throw new RuntimeException(e);
    }
}
Also used : WhitespaceAnalyzer(org.apache.lucene.analysis.core.WhitespaceAnalyzer) TokenStream(org.apache.lucene.analysis.TokenStream) FlagsAttribute(org.apache.lucene.analysis.tokenattributes.FlagsAttribute) PayloadAttribute(org.apache.lucene.analysis.tokenattributes.PayloadAttribute) Token(org.apache.lucene.analysis.Token) IOException(java.io.IOException) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) TypeAttribute(org.apache.lucene.analysis.tokenattributes.TypeAttribute) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) HashSet(java.util.HashSet)

Example 17 with PayloadAttribute

use of org.apache.lucene.analysis.tokenattributes.PayloadAttribute in project lucene-solr by apache.

Source: the TypeAsPayloadTokenFilterTest class, method test.

public void test() throws IOException {
    // Verifies that TypeAsPayloadTokenFilter copies each token's type attribute
    // into its payload, for every token of a ten-word sentence.
    String test = "The quick red fox jumped over the lazy brown dogs";
    TypeAsPayloadTokenFilter nptf = new TypeAsPayloadTokenFilter(new WordTokenFilter(whitespaceMockTokenizer(test)));
    int count = 0;
    CharTermAttribute termAtt = nptf.getAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = nptf.getAttribute(TypeAttribute.class);
    PayloadAttribute payloadAtt = nptf.getAttribute(PayloadAttribute.class);
    nptf.reset();
    while (nptf.incrementToken()) {
        // WordTokenFilter (declared alongside this test) sets the type to the
        // upper-cased first character of the term; the original message wrongly
        // claimed a null check here.
        assertTrue(typeAtt.type() + " does not equal the upper-cased first term char", typeAtt.type().equals(String.valueOf(Character.toUpperCase(termAtt.buffer()[0]))));
        assertTrue("payloadAtt.getPayload() is null and it shouldn't be", payloadAtt.getPayload() != null);
        String type = payloadAtt.getPayload().utf8ToString();
        assertTrue(type + " is not equal to " + typeAtt.type(), type.equals(typeAtt.type()));
        count++;
    }
    // Complete the TokenStream contract; the original left the stream unclosed.
    nptf.end();
    nptf.close();
    assertTrue(count + " does not equal: " + 10, count == 10);
}
Also used : PayloadAttribute(org.apache.lucene.analysis.tokenattributes.PayloadAttribute) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) TypeAttribute(org.apache.lucene.analysis.tokenattributes.TypeAttribute)

Example 18 with PayloadAttribute

use of org.apache.lucene.analysis.tokenattributes.PayloadAttribute in project lucene-solr by apache.

Source: the TokenOffsetPayloadTokenFilterTest class, method test.

public void test() throws IOException {
    // Verifies that TokenOffsetPayloadTokenFilter encodes each token's
    // (startOffset, endOffset) pair into its payload as two big-endian ints.
    String test = "The quick red fox jumped over the lazy brown dogs";
    TokenOffsetPayloadTokenFilter nptf = new TokenOffsetPayloadTokenFilter(whitespaceMockTokenizer(test));
    int count = 0;
    PayloadAttribute payloadAtt = nptf.getAttribute(PayloadAttribute.class);
    OffsetAttribute offsetAtt = nptf.getAttribute(OffsetAttribute.class);
    nptf.reset();
    while (nptf.incrementToken()) {
        BytesRef pay = payloadAtt.getPayload();
        assertTrue("pay is null and it shouldn't be", pay != null);
        // NOTE(review): decoding at fixed positions 0 and 4 assumes pay.offset == 0;
        // decode from pay.offset if the filter ever returns a sliced BytesRef.
        byte[] data = pay.bytes;
        int start = PayloadHelper.decodeInt(data, 0);
        assertTrue(start + " does not equal: " + offsetAtt.startOffset(), start == offsetAtt.startOffset());
        int end = PayloadHelper.decodeInt(data, 4);
        assertTrue(end + " does not equal: " + offsetAtt.endOffset(), end == offsetAtt.endOffset());
        count++;
    }
    // Complete the TokenStream contract; the original left the stream unclosed.
    nptf.end();
    nptf.close();
    assertTrue(count + " does not equal: " + 10, count == 10);
}
Also used : PayloadAttribute(org.apache.lucene.analysis.tokenattributes.PayloadAttribute) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) BytesRef(org.apache.lucene.util.BytesRef)

Example 19 with PayloadAttribute

use of org.apache.lucene.analysis.tokenattributes.PayloadAttribute in project lucene-solr by apache.

Source: the TestNGramFilters class, method testEdgeNGramFilterPayload.

/**
   * Test EdgeNGramFilterFactory on tokens with payloads.
   * Every edge n-gram produced from {@code "test|0.1"} must inherit the
   * float payload 0.1 attached by DelimitedPayloadTokenFilter.
   */
public void testEdgeNGramFilterPayload() throws Exception {
    Reader reader = new StringReader("test|0.1");
    TokenStream stream = whitespaceMockTokenizer(reader);
    stream = tokenFilterFactory("DelimitedPayload", "encoder", "float").create(stream);
    stream = tokenFilterFactory("EdgeNGram", "minGramSize", "1", "maxGramSize", "2").create(stream);
    // The attribute instance is fixed for the stream's lifetime, so look it up
    // once instead of on every loop iteration as the original did.
    PayloadAttribute payAttr = stream.getAttribute(PayloadAttribute.class);
    assertNotNull(payAttr);
    stream.reset();
    while (stream.incrementToken()) {
        BytesRef payData = payAttr.getPayload();
        assertNotNull(payData);
        float payFloat = PayloadHelper.decodeFloat(payData.bytes);
        assertEquals(0.1f, payFloat, 0.0f);
    }
    stream.end();
    stream.close();
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) PayloadAttribute(org.apache.lucene.analysis.tokenattributes.PayloadAttribute) StringReader(java.io.StringReader) StringReader(java.io.StringReader) Reader(java.io.Reader) BytesRef(org.apache.lucene.util.BytesRef)

Example 20 with PayloadAttribute

use of org.apache.lucene.analysis.tokenattributes.PayloadAttribute in project lucene-solr by apache.

Source: the TokenSourcesTest class, method testRandomizedRoundTrip.

// Round-trips a random token stream through indexed term vectors and checks that
// TokenSources reconstructs the same terms/offsets/position-increments (and,
// when stored, payloads). Left byte-identical: assertions depend on the exact
// order of random() draws, so any restructuring could change the test's inputs.
@Repeat(iterations = 10)
public //@Seed("947083AB20AB2D4F")
void testRandomizedRoundTrip() throws Exception {
    // Build a random vocabulary of 1..10 distinct realistic-unicode terms.
    final int distinct = TestUtil.nextInt(random(), 1, 10);
    String[] terms = new String[distinct];
    BytesRef[] termBytes = new BytesRef[distinct];
    for (int i = 0; i < distinct; ++i) {
        terms[i] = TestUtil.randomRealisticUnicodeString(random());
        termBytes[i] = new BytesRef(terms[i]);
    }
    final BaseTermVectorsFormatTestCase.RandomTokenStream rTokenStream = new BaseTermVectorsFormatTestCase.RandomTokenStream(TestUtil.nextInt(random(), 1, 10), terms, termBytes);
    //check to see if the token streams might have non-deterministic testable result
    final boolean storeTermVectorPositions = random().nextBoolean();
    final int[] startOffsets = rTokenStream.getStartOffsets();
    final int[] positionsIncrements = rTokenStream.getPositionsIncrements();
    for (int i = 1; i < positionsIncrements.length; i++) {
        // A nonzero increment with positions stored keeps ordering deterministic.
        if (storeTermVectorPositions && positionsIncrements[i] != 0) {
            continue;
        }
        // Without positions, equal consecutive startOffsets make the reconstructed
        // token order ambiguous, so the round-trip can't be validated — bail out.
        if (startOffsets[i] == startOffsets[i - 1]) {
            if (VERBOSE)
                System.out.println("Skipping test because can't easily validate random token-stream is correct.");
            return;
        }
    }
    //sanity check itself
    assertTokenStreamContents(rTokenStream, rTokenStream.getTerms(), rTokenStream.getStartOffsets(), rTokenStream.getEndOffsets(), rTokenStream.getPositionsIncrements());
    // Index a single document carrying the random stream as a term vector.
    Directory dir = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
    FieldType myFieldType = new FieldType(TextField.TYPE_NOT_STORED);
    myFieldType.setStoreTermVectors(true);
    myFieldType.setStoreTermVectorOffsets(true);
    myFieldType.setStoreTermVectorPositions(storeTermVectorPositions);
    //payloads require positions; it will throw an error otherwise
    myFieldType.setStoreTermVectorPayloads(storeTermVectorPositions && random().nextBoolean());
    Document doc = new Document();
    doc.add(new Field("field", rTokenStream, myFieldType));
    writer.addDocument(doc);
    IndexReader reader = writer.getReader();
    writer.close();
    assertEquals(1, reader.numDocs());
    // Reconstruct a token stream from the stored term vector.
    TokenStream vectorTokenStream = TokenSources.getTermVectorTokenStreamOrNull("field", reader.getTermVectors(0), -1);
    //sometimes check payloads
    PayloadAttribute payloadAttribute = null;
    if (myFieldType.storeTermVectorPayloads() && usually()) {
        payloadAttribute = vectorTokenStream.addAttribute(PayloadAttribute.class);
    }
    assertTokenStreamContents(vectorTokenStream, rTokenStream.getTerms(), rTokenStream.getStartOffsets(), rTokenStream.getEndOffsets(), myFieldType.storeTermVectorPositions() ? rTokenStream.getPositionsIncrements() : null);
    //test payloads
    if (payloadAttribute != null) {
        vectorTokenStream.reset();
        for (int i = 0; vectorTokenStream.incrementToken(); i++) {
            assertEquals(rTokenStream.getPayloads()[i], payloadAttribute.getPayload());
        }
    }
    reader.close();
    dir.close();
}
Also used : CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) TokenStream(org.apache.lucene.analysis.TokenStream) PayloadAttribute(org.apache.lucene.analysis.tokenattributes.PayloadAttribute) Document(org.apache.lucene.document.Document) FieldType(org.apache.lucene.document.FieldType) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) BaseTermVectorsFormatTestCase(org.apache.lucene.index.BaseTermVectorsFormatTestCase) IndexReader(org.apache.lucene.index.IndexReader) BytesRef(org.apache.lucene.util.BytesRef) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) Directory(org.apache.lucene.store.Directory) Repeat(com.carrotsearch.randomizedtesting.annotations.Repeat)

Aggregations

PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute)27 CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)16 TokenStream (org.apache.lucene.analysis.TokenStream)14 BytesRef (org.apache.lucene.util.BytesRef)13 OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute)12 PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)11 TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute)10 FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute)7 StringReader (java.io.StringReader)6 IOException (java.io.IOException)5 Document (org.apache.lucene.document.Document)5 Reader (java.io.Reader)4 Token (org.apache.lucene.analysis.Token)4 Field (org.apache.lucene.document.Field)4 MockTokenizer (org.apache.lucene.analysis.MockTokenizer)3 TermToBytesRefAttribute (org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute)3 LinkedHashMap (java.util.LinkedHashMap)2 LinkedList (java.util.LinkedList)2 Map (java.util.Map)2 TreeMap (java.util.TreeMap)2