Use of org.apache.lucene.analysis.tokenattributes.PayloadAttribute in project lucene-solr by Apache.
The class TestSnowball, method testFilterTokens.
public void testFilterTokens() throws Exception {
  SnowballFilter filter = new SnowballFilter(new TestTokenStream(), "English");
  CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
  OffsetAttribute offsetAtt = filter.getAttribute(OffsetAttribute.class);
  TypeAttribute typeAtt = filter.getAttribute(TypeAttribute.class);
  PayloadAttribute payloadAtt = filter.getAttribute(PayloadAttribute.class);
  PositionIncrementAttribute posIncAtt = filter.getAttribute(PositionIncrementAttribute.class);
  FlagsAttribute flagsAtt = filter.getAttribute(FlagsAttribute.class);
  filter.incrementToken();
  assertEquals("accent", termAtt.toString());
  assertEquals(2, offsetAtt.startOffset());
  assertEquals(7, offsetAtt.endOffset());
  assertEquals("wrd", typeAtt.type());
  assertEquals(3, posIncAtt.getPositionIncrement());
  assertEquals(77, flagsAtt.getFlags());
  assertEquals(new BytesRef(new byte[] { 0, 1, 2, 3 }), payloadAtt.getPayload());
}
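The assertions only pass because the upstream TestTokenStream pre-populates every attribute before SnowballFilter sees the token; the filter rewrites the term and leaves offsets, type, flags and payload untouched. A minimal sketch of such a one-token stream (the class name, term text and attribute values here are illustrative, not the actual TestTokenStream from the Lucene test suite):

// Hypothetical single-token stream that pre-populates the attributes the test reads.
final class SingleAccentTokenStream extends TokenStream {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
  private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
  private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
  private boolean done = false;

  @Override
  public boolean incrementToken() {
    if (done) {
      return false;
    }
    clearAttributes();
    termAtt.setEmpty().append("accents");   // the English Snowball stemmer reduces this to "accent"
    offsetAtt.setOffset(2, 7);
    typeAtt.setType("wrd");
    posIncAtt.setPositionIncrement(3);
    flagsAtt.setFlags(77);
    payloadAtt.setPayload(new BytesRef(new byte[] { 0, 1, 2, 3 }));
    done = true;
    return true;
  }
}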
Use of org.apache.lucene.analysis.tokenattributes.PayloadAttribute in project lucene-solr by Apache.
The class TestNGramFilters, method testNGramFilterPayload.
/**
 * Test NGramFilterFactory on tokens with payloads
 */
public void testNGramFilterPayload() throws Exception {
  Reader reader = new StringReader("test|0.1");
  TokenStream stream = whitespaceMockTokenizer(reader);
  stream = tokenFilterFactory("DelimitedPayload", "encoder", "float").create(stream);
  stream = tokenFilterFactory("NGram", "minGramSize", "1", "maxGramSize", "2").create(stream);
  stream.reset();
  while (stream.incrementToken()) {
    PayloadAttribute payAttr = stream.getAttribute(PayloadAttribute.class);
    assertNotNull(payAttr);
    BytesRef payData = payAttr.getPayload();
    assertNotNull(payData);
    float payFloat = PayloadHelper.decodeFloat(payData.bytes);
    assertEquals(0.1f, payFloat, 0.0f);
  }
  stream.end();
  stream.close();
}
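The whitespaceMockTokenizer and tokenFilterFactory calls are test-framework helpers; roughly the same payload round trip can be wired up with plain Lucene classes. A minimal sketch, omitting the n-gram step and assuming a Lucene version where WhitespaceTokenizer has a no-argument constructor:

// Sketch: attach a float payload with DelimitedPayloadTokenFilter and decode it again.
// The input string mirrors the test above; only the pipeline wiring differs.
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader("test|0.1"));
TokenStream ts = new DelimitedPayloadTokenFilter(tokenizer, '|', new FloatEncoder());
PayloadAttribute payAtt = ts.addAttribute(PayloadAttribute.class);
ts.reset();
while (ts.incrementToken()) {
  float value = PayloadHelper.decodeFloat(payAtt.getPayload().bytes);
  System.out.println(value);   // 0.1
}
ts.end();
ts.close();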
Use of org.apache.lucene.analysis.tokenattributes.PayloadAttribute in project lucene-solr by Apache.
The class TestDocumentWriter, method testTokenReuse.
public void testTokenReuse() throws IOException {
  Analyzer analyzer = new Analyzer() {
    @Override
    public TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(tokenizer, new TokenFilter(tokenizer) {
        boolean first = true;
        AttributeSource.State state;

        @Override
        public boolean incrementToken() throws IOException {
          if (state != null) {
            restoreState(state);
            payloadAtt.setPayload(null);
            posIncrAtt.setPositionIncrement(0);
            termAtt.setEmpty().append("b");
            state = null;
            return true;
          }
          boolean hasNext = input.incrementToken();
          if (!hasNext)
            return false;
          if (Character.isDigit(termAtt.buffer()[0])) {
            posIncrAtt.setPositionIncrement(termAtt.buffer()[0] - '0');
          }
          if (first) {
            // set payload on first position only
            payloadAtt.setPayload(new BytesRef(new byte[] { 100 }));
            first = false;
          }
          // index a "synonym" for every token
          state = captureState();
          return true;
        }

        @Override
        public void reset() throws IOException {
          super.reset();
          first = true;
          state = null;
        }

        final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
        final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
        final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
      });
    }
  };
  IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(analyzer));
  Document doc = new Document();
  doc.add(newTextField("f1", "a 5 a a", Field.Store.YES));
  writer.addDocument(doc);
  writer.commit();
  SegmentCommitInfo info = writer.newestSegment();
  writer.close();
  SegmentReader reader = new SegmentReader(info, Version.LATEST.major, newIOContext(random()));
  PostingsEnum termPositions = MultiFields.getTermPositionsEnum(reader, "f1", new BytesRef("a"));
  assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  int freq = termPositions.freq();
  assertEquals(3, freq);
  assertEquals(0, termPositions.nextPosition());
  assertNotNull(termPositions.getPayload());
  assertEquals(6, termPositions.nextPosition());
  assertNull(termPositions.getPayload());
  assertEquals(7, termPositions.nextPosition());
  assertNull(termPositions.getPayload());
  reader.close();
}
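The expected values follow from the filter's rules: the digit token "5" sets a position increment of 5, so the three "a" tokens from "a 5 a a" land at positions 0, 6 and 7, and only the very first token carries the payload (the injected "b" synonyms have it cleared). The same walk over positions and payloads can be written as a generic loop instead of fixed assertions; a sketch using the same MultiFields call as the test:

// Sketch: dump every position and payload of term "a" in field "f1".
PostingsEnum pe = MultiFields.getTermPositionsEnum(reader, "f1", new BytesRef("a"));
if (pe != null) {
  while (pe.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
    for (int i = 0; i < pe.freq(); i++) {
      int pos = pe.nextPosition();
      BytesRef payload = pe.getPayload();   // null for positions without a payload
      System.out.println("doc=" + pe.docID() + " pos=" + pos + " payload=" + payload);
    }
  }
}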
Use of org.apache.lucene.analysis.tokenattributes.PayloadAttribute in project jackrabbit by Apache.
The class SearchIndex, method mergeAggregatedNodeIndexes.
/**
 * Merges the fulltext indexed fields of the aggregated node states into
 * <code>doc</code>.
 *
 * @param state the node state on which <code>doc</code> was created.
 * @param doc   the lucene document with index fields from <code>state</code>.
 * @param ifv   the current index format version.
 */
protected void mergeAggregatedNodeIndexes(NodeState state, Document doc, IndexFormatVersion ifv) {
    if (indexingConfig != null) {
        AggregateRule[] aggregateRules = indexingConfig.getAggregateRules();
        if (aggregateRules == null) {
            return;
        }
        try {
            ItemStateManager ism = getContext().getItemStateManager();
            for (AggregateRule aggregateRule : aggregateRules) {
                boolean ruleMatched = false;
                // node includes
                NodeState[] aggregates = aggregateRule.getAggregatedNodeStates(state);
                if (aggregates != null) {
                    ruleMatched = true;
                    for (NodeState aggregate : aggregates) {
                        Document aDoc = createDocument(aggregate, getNamespaceMappings(), ifv);
                        // transfer fields to doc if there are any
                        Fieldable[] fulltextFields = aDoc.getFieldables(FieldNames.FULLTEXT);
                        if (fulltextFields != null) {
                            for (Fieldable fulltextField : fulltextFields) {
                                doc.add(fulltextField);
                            }
                            doc.add(new Field(FieldNames.AGGREGATED_NODE_UUID, false, aggregate.getNodeId().toString(), Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS, Field.TermVector.NO));
                        }
                    }
                    // make sure that fulltext fields are aligned properly
                    // first all stored fields, then remaining
                    Fieldable[] fulltextFields = doc.getFieldables(FieldNames.FULLTEXT);
                    doc.removeFields(FieldNames.FULLTEXT);
                    Arrays.sort(fulltextFields, FIELDS_COMPARATOR_STORED);
                    for (Fieldable f : fulltextFields) {
                        doc.add(f);
                    }
                }
                // property includes
                PropertyState[] propStates = aggregateRule.getAggregatedPropertyStates(state);
                if (propStates != null) {
                    ruleMatched = true;
                    for (PropertyState propState : propStates) {
                        String namePrefix = FieldNames.createNamedValue(getNamespaceMappings().translateName(propState.getName()), "");
                        NodeState parent = (NodeState) ism.getItemState(propState.getParentId());
                        Document aDoc = createDocument(parent, getNamespaceMappings(), ifv);
                        try {
                            // find the right fields to transfer
                            Fieldable[] fields = aDoc.getFieldables(FieldNames.PROPERTIES);
                            for (Fieldable field : fields) {
                                // assume properties fields use SingleTokenStream
                                TokenStream tokenStream = field.tokenStreamValue();
                                TermAttribute termAttribute = tokenStream.addAttribute(TermAttribute.class);
                                PayloadAttribute payloadAttribute = tokenStream.addAttribute(PayloadAttribute.class);
                                tokenStream.incrementToken();
                                tokenStream.end();
                                tokenStream.close();
                                String value = new String(termAttribute.termBuffer(), 0, termAttribute.termLength());
                                if (value.startsWith(namePrefix)) {
                                    // extract value
                                    String rawValue = value.substring(namePrefix.length());
                                    // create new named value
                                    Path p = getRelativePath(state, propState);
                                    String path = getNamespaceMappings().translatePath(p);
                                    value = FieldNames.createNamedValue(path, rawValue);
                                    termAttribute.setTermBuffer(value);
                                    PropertyMetaData pdm = PropertyMetaData.fromByteArray(payloadAttribute.getPayload().getData());
                                    doc.add(new Field(field.name(), new SingletonTokenStream(value, pdm.getPropertyType())));
                                    doc.add(new Field(FieldNames.AGGREGATED_NODE_UUID, false, parent.getNodeId().toString(), Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS, Field.TermVector.NO));
                                    if (pdm.getPropertyType() == PropertyType.STRING) {
                                        // add to fulltext index
                                        Field ft = new Field(FieldNames.FULLTEXT, false, rawValue, Field.Store.YES, Field.Index.ANALYZED_NO_NORMS, Field.TermVector.NO);
                                        doc.add(ft);
                                    }
                                }
                            }
                        } finally {
                            Util.disposeDocument(aDoc);
                        }
                    }
                }
                // only use first aggregate definition that matches
                if (ruleMatched) {
                    break;
                }
            }
        } catch (NoSuchItemStateException e) {
            // do not fail if aggregate cannot be created
            log.info("Exception while building indexing aggregate for {}. Node is not available {}.", state.getNodeId(), e.getMessage());
        } catch (Exception e) {
            // do not fail if aggregate cannot be created
            log.warn("Exception while building indexing aggregate for " + state.getNodeId(), e);
        }
    }
}
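The method relies on each PROPERTIES field being a single-token stream whose term holds the named value and whose payload encodes the property metadata; it reads that one token back via TermAttribute and PayloadAttribute. The write side of the same convention can be approximated with the plain Lucene 3.x attribute API; a minimal sketch (a hand-rolled one-token stream, not Jackrabbit's actual SingletonTokenStream):

// Sketch (Lucene 3.x API): a one-token stream carrying a term plus a payload,
// analogous to what the PROPERTIES fields above are assumed to contain.
final class OneTokenStream extends TokenStream {
    private final TermAttribute termAtt = addAttribute(TermAttribute.class);
    private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
    private final String value;
    private final byte[] payloadBytes;
    private boolean consumed = false;

    OneTokenStream(String value, byte[] payloadBytes) {
        this.value = value;
        this.payloadBytes = payloadBytes;
    }

    @Override
    public boolean incrementToken() {
        if (consumed) {
            return false;
        }
        clearAttributes();
        termAtt.setTermBuffer(value);                  // the named value becomes the single term
        payloadAtt.setPayload(new Payload(payloadBytes)); // metadata travels as the payload
        consumed = true;
        return true;
    }
}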
Use of org.apache.lucene.analysis.tokenattributes.PayloadAttribute in project jackrabbit by Apache.
The class AbstractIndex, method getFinishedDocument.
/**
 * Returns a document that is finished with text extraction and is ready to
 * be added to the index.
 *
 * @param doc the document to check.
 * @return <code>doc</code> if it is finished already or a stripped down
 *         copy of <code>doc</code> without text extractors.
 * @throws IOException if the document cannot be added to the indexing
 *         queue.
 */
private Document getFinishedDocument(Document doc) throws IOException {
    if (!Util.isDocumentReady(doc)) {
        Document copy = new Document();
        // mark the document that reindexing is required
        copy.add(new Field(FieldNames.REINDEXING_REQUIRED, false, "", Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS, Field.TermVector.NO));
        for (Fieldable f : doc.getFields()) {
            Fieldable field = null;
            Field.TermVector tv = getTermVectorParameter(f);
            Field.Store stored = f.isStored() ? Field.Store.YES : Field.Store.NO;
            Field.Index indexed = getIndexParameter(f);
            if (f instanceof LazyTextExtractorField || f.readerValue() != null) {
                // replace all readers with empty string reader
                field = new Field(f.name(), new StringReader(""), tv);
            } else if (f.stringValue() != null) {
                field = new Field(f.name(), false, f.stringValue(), stored, indexed, tv);
            } else if (f.isBinary()) {
                field = new Field(f.name(), f.getBinaryValue(), stored);
            } else if (f.tokenStreamValue() != null && f.tokenStreamValue() instanceof SingletonTokenStream) {
                TokenStream tokenStream = f.tokenStreamValue();
                TermAttribute termAttribute = tokenStream.addAttribute(TermAttribute.class);
                PayloadAttribute payloadAttribute = tokenStream.addAttribute(PayloadAttribute.class);
                tokenStream.incrementToken();
                String value = new String(termAttribute.termBuffer(), 0, termAttribute.termLength());
                tokenStream.reset();
                field = new Field(f.name(), new SingletonTokenStream(value, (Payload) payloadAttribute.getPayload().clone()));
            }
            if (field != null) {
                field.setOmitNorms(f.getOmitNorms());
                copy.add(field);
            }
        }
        // schedule the original document for later indexing
        Document existing = indexingQueue.addDocument(doc);
        if (existing != null) {
            // the queue already contained a pending document for this
            // node. -> dispose the document
            Util.disposeDocument(existing);
        }
        // use the stripped down copy for now
        doc = copy;
    }
    return doc;
}
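Note the defensive copy: the payload is cloned with (Payload) payloadAttribute.getPayload().clone() before being handed to the new SingletonTokenStream, so the stripped-down copy does not share mutable payload state with the queued original. On the BytesRef-based attribute API used by the lucene-solr tests at the top of this page, the equivalent copy would typically be made with BytesRef.deepCopyOf; a minimal illustration, not part of the Jackrabbit code:

// Sketch: deep copy of a BytesRef payload, the modern counterpart of Payload.clone().
BytesRef original = new BytesRef(new byte[] { 100 });
BytesRef copy = BytesRef.deepCopyOf(original);   // independent bytes, safe to reuse elsewhere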