Example 11 with PayloadAttribute

Use of org.apache.lucene.analysis.tokenattributes.PayloadAttribute in project lucene-solr by apache.

From class TestSnowball, method testFilterTokens.

public void testFilterTokens() throws Exception {
    SnowballFilter filter = new SnowballFilter(new TestTokenStream(), "English");
    CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = filter.getAttribute(OffsetAttribute.class);
    TypeAttribute typeAtt = filter.getAttribute(TypeAttribute.class);
    PayloadAttribute payloadAtt = filter.getAttribute(PayloadAttribute.class);
    PositionIncrementAttribute posIncAtt = filter.getAttribute(PositionIncrementAttribute.class);
    FlagsAttribute flagsAtt = filter.getAttribute(FlagsAttribute.class);
    filter.incrementToken();
    assertEquals("accent", termAtt.toString());
    assertEquals(2, offsetAtt.startOffset());
    assertEquals(7, offsetAtt.endOffset());
    assertEquals("wrd", typeAtt.type());
    assertEquals(3, posIncAtt.getPositionIncrement());
    assertEquals(77, flagsAtt.getFlags());
    assertEquals(new BytesRef(new byte[] { 0, 1, 2, 3 }), payloadAtt.getPayload());
}
Also used: PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute), FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), BytesRef (org.apache.lucene.util.BytesRef), PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)
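
The test depends on a TestTokenStream helper that this page does not show. Below is a minimal sketch of what it plausibly looks like, reconstructed from the values the assertions expect; the class name comes from the test, but treat the body as an assumption rather than the original source.

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.BytesRef;

// Sketch: a one-token stream whose attribute values match the assertions above.
final class TestTokenStream extends TokenStream {

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
    private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
    private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
    private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);

    private boolean done = false;

    @Override
    public boolean incrementToken() {
        if (done) {
            return false;
        }
        done = true;
        clearAttributes();
        // "accents" is stemmed to "accent" by SnowballFilter with the English stemmer
        termAtt.setEmpty().append("accents");
        offsetAtt.setOffset(2, 7);
        typeAtt.setType("wrd");
        posIncAtt.setPositionIncrement(3);
        payloadAtt.setPayload(new BytesRef(new byte[] { 0, 1, 2, 3 }));
        flagsAtt.setFlags(77);
        return true;
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        done = false;
    }
}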

Example 12 with PayloadAttribute

Use of org.apache.lucene.analysis.tokenattributes.PayloadAttribute in project lucene-solr by apache.

From class TestNGramFilters, method testNGramFilterPayload.

/**
   * Test NGramFilterFactory on tokens with payloads
   */
public void testNGramFilterPayload() throws Exception {
    Reader reader = new StringReader("test|0.1");
    TokenStream stream = whitespaceMockTokenizer(reader);
    stream = tokenFilterFactory("DelimitedPayload", "encoder", "float").create(stream);
    stream = tokenFilterFactory("NGram", "minGramSize", "1", "maxGramSize", "2").create(stream);
    stream.reset();
    while (stream.incrementToken()) {
        PayloadAttribute payAttr = stream.getAttribute(PayloadAttribute.class);
        assertNotNull(payAttr);
        BytesRef payData = payAttr.getPayload();
        assertNotNull(payData);
        float payFloat = PayloadHelper.decodeFloat(payData.bytes);
        assertEquals(0.1f, payFloat, 0.0f);
    }
    stream.end();
    stream.close();
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream), PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute), StringReader (java.io.StringReader), Reader (java.io.Reader), BytesRef (org.apache.lucene.util.BytesRef)
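
The payload written by the delimited-payload filter is a four-byte float, and PayloadHelper provides the matching encode/decode pair. A minimal round-trip sketch; the class wrapper and the 0.1f input are illustrative, PayloadHelper itself is the class the test uses:

import org.apache.lucene.analysis.payloads.PayloadHelper;

public class PayloadRoundTrip {

    public static void main(String[] args) {
        // encodeFloat writes the float's IEEE 754 bits into a 4-byte array
        byte[] encoded = PayloadHelper.encodeFloat(0.1f);
        float decoded = PayloadHelper.decodeFloat(encoded);
        System.out.println(decoded); // prints 0.1
    }
}

Because the payload is attached before the NGram filter runs, every 1-gram and 2-gram of "test" carries the same 0.1 payload, which is exactly what the loop asserts. Note the test decodes from payData.bytes at offset 0; for a BytesRef with a nonzero offset there is also a decodeFloat(bytes, offset) overload.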

Example 13 with PayloadAttribute

Use of org.apache.lucene.analysis.tokenattributes.PayloadAttribute in project lucene-solr by apache.

From class TestDocumentWriter, method testTokenReuse.

public void testTokenReuse() throws IOException {
    Analyzer analyzer = new Analyzer() {

        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(tokenizer, new TokenFilter(tokenizer) {

                boolean first = true;

                AttributeSource.State state;

                @Override
                public boolean incrementToken() throws IOException {
                    if (state != null) {
                        restoreState(state);
                        payloadAtt.setPayload(null);
                        posIncrAtt.setPositionIncrement(0);
                        termAtt.setEmpty().append("b");
                        state = null;
                        return true;
                    }
                    boolean hasNext = input.incrementToken();
                    if (!hasNext)
                        return false;
                    if (Character.isDigit(termAtt.buffer()[0])) {
                        posIncrAtt.setPositionIncrement(termAtt.buffer()[0] - '0');
                    }
                    if (first) {
                        // set payload on first position only
                        payloadAtt.setPayload(new BytesRef(new byte[] { 100 }));
                        first = false;
                    }
                    // index a "synonym" for every token
                    state = captureState();
                    return true;
                }

                @Override
                public void reset() throws IOException {
                    super.reset();
                    first = true;
                    state = null;
                }

                final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

                final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);

                final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
            });
        }
    };
    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(analyzer));
    Document doc = new Document();
    doc.add(newTextField("f1", "a 5 a a", Field.Store.YES));
    writer.addDocument(doc);
    writer.commit();
    SegmentCommitInfo info = writer.newestSegment();
    writer.close();
    SegmentReader reader = new SegmentReader(info, Version.LATEST.major, newIOContext(random()));
    PostingsEnum termPositions = MultiFields.getTermPositionsEnum(reader, "f1", new BytesRef("a"));
    assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    int freq = termPositions.freq();
    assertEquals(3, freq);
    assertEquals(0, termPositions.nextPosition());
    assertNotNull(termPositions.getPayload());
    assertEquals(6, termPositions.nextPosition());
    assertNull(termPositions.getPayload());
    assertEquals(7, termPositions.nextPosition());
    assertNull(termPositions.getPayload());
    reader.close();
}
Also used: PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute), AttributeSource (org.apache.lucene.util.AttributeSource), IOException (java.io.IOException), Document (org.apache.lucene.document.Document), PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), BytesRef (org.apache.lucene.util.BytesRef)
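
Why the assertions at the end hold: the anonymous filter emits a zero-increment "b" synonym after every real token, and a token starting with a digit has its position increment rewritten to that digit's value. A walk-through of the input "a 5 a a" as a sketch:

// pos 0: "a"  (first token, payload {100}); then synonym "b" with posInc 0
// pos 5: "5"  (leading digit '5' sets posInc to 5); then "b" at pos 5
// pos 6: "a"; then "b" at pos 6
// pos 7: "a"; then "b" at pos 7
//
// So term "a" has freq 3 at positions 0, 6 and 7, and only the occurrence
// at position 0 carries a payload, matching the assertNotNull/assertNull calls.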

Example 14 with PayloadAttribute

Use of org.apache.lucene.analysis.tokenattributes.PayloadAttribute in project jackrabbit by apache.

From class SearchIndex, method mergeAggregatedNodeIndexes.

/**
     * Merges the fulltext indexed fields of the aggregated node states into
     * <code>doc</code>.
     *
     * @param state the node state on which <code>doc</code> was created.
     * @param doc the lucene document with index fields from <code>state</code>.
     * @param ifv the current index format version.
     */
protected void mergeAggregatedNodeIndexes(NodeState state, Document doc, IndexFormatVersion ifv) {
    if (indexingConfig != null) {
        AggregateRule[] aggregateRules = indexingConfig.getAggregateRules();
        if (aggregateRules == null) {
            return;
        }
        try {
            ItemStateManager ism = getContext().getItemStateManager();
            for (AggregateRule aggregateRule : aggregateRules) {
                boolean ruleMatched = false;
                // node includes
                NodeState[] aggregates = aggregateRule.getAggregatedNodeStates(state);
                if (aggregates != null) {
                    ruleMatched = true;
                    for (NodeState aggregate : aggregates) {
                        Document aDoc = createDocument(aggregate, getNamespaceMappings(), ifv);
                        // transfer fields to doc if there are any
                        Fieldable[] fulltextFields = aDoc.getFieldables(FieldNames.FULLTEXT);
                        if (fulltextFields != null) {
                            for (Fieldable fulltextField : fulltextFields) {
                                doc.add(fulltextField);
                            }
                            doc.add(new Field(FieldNames.AGGREGATED_NODE_UUID, false, aggregate.getNodeId().toString(), Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS, Field.TermVector.NO));
                        }
                    }
                    // make sure that fulltext fields are aligned properly
                    // first all stored fields, then remaining
                    Fieldable[] fulltextFields = doc.getFieldables(FieldNames.FULLTEXT);
                    doc.removeFields(FieldNames.FULLTEXT);
                    Arrays.sort(fulltextFields, FIELDS_COMPARATOR_STORED);
                    for (Fieldable f : fulltextFields) {
                        doc.add(f);
                    }
                }
                // property includes
                PropertyState[] propStates = aggregateRule.getAggregatedPropertyStates(state);
                if (propStates != null) {
                    ruleMatched = true;
                    for (PropertyState propState : propStates) {
                        String namePrefix = FieldNames.createNamedValue(getNamespaceMappings().translateName(propState.getName()), "");
                        NodeState parent = (NodeState) ism.getItemState(propState.getParentId());
                        Document aDoc = createDocument(parent, getNamespaceMappings(), ifv);
                        try {
                            // find the right fields to transfer
                            Fieldable[] fields = aDoc.getFieldables(FieldNames.PROPERTIES);
                            for (Fieldable field : fields) {
                                // assume properties fields use SingleTokenStream
                                TokenStream tokenStream = field.tokenStreamValue();
                                TermAttribute termAttribute = tokenStream.addAttribute(TermAttribute.class);
                                PayloadAttribute payloadAttribute = tokenStream.addAttribute(PayloadAttribute.class);
                                tokenStream.incrementToken();
                                tokenStream.end();
                                tokenStream.close();
                                String value = new String(termAttribute.termBuffer(), 0, termAttribute.termLength());
                                if (value.startsWith(namePrefix)) {
                                    // extract value
                                    String rawValue = value.substring(namePrefix.length());
                                    // create new named value
                                    Path p = getRelativePath(state, propState);
                                    String path = getNamespaceMappings().translatePath(p);
                                    value = FieldNames.createNamedValue(path, rawValue);
                                    termAttribute.setTermBuffer(value);
                                    PropertyMetaData pdm = PropertyMetaData.fromByteArray(payloadAttribute.getPayload().getData());
                                    doc.add(new Field(field.name(), new SingletonTokenStream(value, pdm.getPropertyType())));
                                    doc.add(new Field(FieldNames.AGGREGATED_NODE_UUID, false, parent.getNodeId().toString(), Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS, Field.TermVector.NO));
                                    if (pdm.getPropertyType() == PropertyType.STRING) {
                                        // add to fulltext index
                                        Field ft = new Field(FieldNames.FULLTEXT, false, rawValue, Field.Store.YES, Field.Index.ANALYZED_NO_NORMS, Field.TermVector.NO);
                                        doc.add(ft);
                                    }
                                }
                            }
                        } finally {
                            Util.disposeDocument(aDoc);
                        }
                    }
                }
                // only use first aggregate definition that matches
                if (ruleMatched) {
                    break;
                }
            }
        } catch (NoSuchItemStateException e) {
            // do not fail if aggregate cannot be created
            log.info("Exception while building indexing aggregate for {}. Node is not available {}.", state.getNodeId(), e.getMessage());
        } catch (Exception e) {
            // do not fail if aggregate cannot be created
            log.warn("Exception while building indexing aggregate for " + state.getNodeId(), e);
        }
    }
}
Also used: Path (org.apache.jackrabbit.spi.Path), TokenStream (org.apache.lucene.analysis.TokenStream), PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute), NodeState (org.apache.jackrabbit.core.state.NodeState), Document (org.apache.lucene.document.Document), FileSystemException (org.apache.jackrabbit.core.fs.FileSystemException), SAXException (org.xml.sax.SAXException), JournalException (org.apache.jackrabbit.core.journal.JournalException), NoSuchItemStateException (org.apache.jackrabbit.core.state.NoSuchItemStateException), RepositoryException (javax.jcr.RepositoryException), MalformedURLException (java.net.MalformedURLException), IOException (java.io.IOException), ItemStateException (org.apache.jackrabbit.core.state.ItemStateException), ParserConfigurationException (javax.xml.parsers.ParserConfigurationException), InvalidQueryException (javax.jcr.query.InvalidQueryException), PropertyState (org.apache.jackrabbit.core.state.PropertyState), SortField (org.apache.lucene.search.SortField), Field (org.apache.lucene.document.Field), Fieldable (org.apache.lucene.document.Fieldable), ItemStateManager (org.apache.jackrabbit.core.state.ItemStateManager), TermAttribute (org.apache.lucene.analysis.tokenattributes.TermAttribute)
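
The heart of the property transfer is a prefix rewrite on the stored named value: the aggregated document keys each value by property name, and it gets re-keyed under the property's path relative to the aggregating node. A minimal sketch of just that string surgery; SEP is a hypothetical stand-in, since this sketch does not assume to know which separator FieldNames.createNamedValue uses internally:

// Hypothetical illustration of the named-value rewrite performed above.
public class NamedValueRewriteSketch {

    public static void main(String[] args) {
        final char SEP = '\uFFFF'; // assumption, for illustration only
        String namePrefix = "ns:title" + SEP;        // createNamedValue(name, "")
        String value = "ns:title" + SEP + "Hello";   // term value from the aggregated doc
        if (value.startsWith(namePrefix)) {
            String rawValue = value.substring(namePrefix.length()); // "Hello"
            // re-keyed under the property's path relative to the aggregating node
            String reKeyed = "child/ns:title" + SEP + rawValue;
            System.out.println(reKeyed);
        }
    }
}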

Example 15 with PayloadAttribute

Use of org.apache.lucene.analysis.tokenattributes.PayloadAttribute in project jackrabbit by apache.

From class AbstractIndex, method getFinishedDocument.

/**
     * Returns a document that is finished with text extraction and is ready to
     * be added to the index.
     *
     * @param doc the document to check.
     * @return <code>doc</code> if it is finished already or a stripped down
     *         copy of <code>doc</code> without text extractors.
     * @throws IOException if the document cannot be added to the indexing
     *                     queue.
     */
private Document getFinishedDocument(Document doc) throws IOException {
    if (!Util.isDocumentReady(doc)) {
        Document copy = new Document();
        // mark the document that reindexing is required
        copy.add(new Field(FieldNames.REINDEXING_REQUIRED, false, "", Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS, Field.TermVector.NO));
        for (Fieldable f : doc.getFields()) {
            Fieldable field = null;
            Field.TermVector tv = getTermVectorParameter(f);
            Field.Store stored = f.isStored() ? Field.Store.YES : Field.Store.NO;
            Field.Index indexed = getIndexParameter(f);
            if (f instanceof LazyTextExtractorField || f.readerValue() != null) {
                // replace all readers with empty string reader
                field = new Field(f.name(), new StringReader(""), tv);
            } else if (f.stringValue() != null) {
                field = new Field(f.name(), false, f.stringValue(), stored, indexed, tv);
            } else if (f.isBinary()) {
                field = new Field(f.name(), f.getBinaryValue(), stored);
            } else if (f.tokenStreamValue() != null && f.tokenStreamValue() instanceof SingletonTokenStream) {
                TokenStream tokenStream = f.tokenStreamValue();
                TermAttribute termAttribute = tokenStream.addAttribute(TermAttribute.class);
                PayloadAttribute payloadAttribute = tokenStream.addAttribute(PayloadAttribute.class);
                tokenStream.incrementToken();
                String value = new String(termAttribute.termBuffer(), 0, termAttribute.termLength());
                tokenStream.reset();
                field = new Field(f.name(), new SingletonTokenStream(value, (Payload) payloadAttribute.getPayload().clone()));
            }
            if (field != null) {
                field.setOmitNorms(f.getOmitNorms());
                copy.add(field);
            }
        }
        // schedule the original document for later indexing
        Document existing = indexingQueue.addDocument(doc);
        if (existing != null) {
            // the queue already contained a pending document for this
            // node. -> dispose the document
            Util.disposeDocument(existing);
        }
        // use the stripped down copy for now
        doc = copy;
    }
    return doc;
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream), PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute), Document (org.apache.lucene.document.Document), Field (org.apache.lucene.document.Field), Fieldable (org.apache.lucene.document.Fieldable), StringReader (java.io.StringReader), TermAttribute (org.apache.lucene.analysis.tokenattributes.TermAttribute), Payload (org.apache.lucene.index.Payload)
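
Two details worth noting. First, a TokenStream can only be consumed once, so after reading the single token the code calls reset(), presumably so the original stream can be consumed again when the queued document is indexed later. Second, the payload is cloned so the stripped copy and the queued original do not share mutable state. A sketch of the copy semantics this relies on, assuming the Lucene 3.x Payload API used by Jackrabbit, where clone() copies the underlying byte array:

import org.apache.lucene.index.Payload;

public class PayloadCloneSketch {

    public static void main(String[] args) {
        Payload original = new Payload(new byte[] { 1, 2, 3 });
        Payload copy = (Payload) original.clone();
        copy.getData()[0] = 9;                     // mutate only the copy
        System.out.println(original.getData()[0]); // still 1
    }
}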

Aggregations

Usage counts across all examples:

PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute): 27
CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 16
TokenStream (org.apache.lucene.analysis.TokenStream): 14
BytesRef (org.apache.lucene.util.BytesRef): 13
OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute): 12
PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute): 11
TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute): 10
FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute): 7
StringReader (java.io.StringReader): 6
IOException (java.io.IOException): 5
Document (org.apache.lucene.document.Document): 5
Reader (java.io.Reader): 4
Token (org.apache.lucene.analysis.Token): 4
Field (org.apache.lucene.document.Field): 4
MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 3
TermToBytesRefAttribute (org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute): 3
LinkedHashMap (java.util.LinkedHashMap): 2
LinkedList (java.util.LinkedList): 2
Map (java.util.Map): 2
TreeMap (java.util.TreeMap): 2