Use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project languagetool by languagetool-org.
From the class LanguageToolFilterTest, method displayTokensWithFullDetails:
private static void displayTokensWithFullDetails(TokenStream stream) throws IOException {
  CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
  PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
  OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
  TypeAttribute type = stream.addAttribute(TypeAttribute.class);
  int position = 0;
  while (stream.incrementToken()) {
    int increment = posIncr.getPositionIncrement();
    if (increment > 0) {
      position = position + increment;
      System.out.println();
      System.out.print(position + ": ");
    }
    System.out.print("[" + term + ":" + offset.startOffset() + "->" + offset.endOffset() + ":" + type.type() + "] ");
  }
  System.out.println();
}
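The helper consumes a stream but never resets or closes it, so the caller owns the stream lifecycle. A minimal driver sketch, assuming a recent Lucene with a no-arg StandardAnalyzer constructor; the class name is hypothetical and the helper is assumed copied in (in the test it is private):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

public class DisplayTokensDemo {
  public static void main(String[] args) throws IOException {
    Analyzer analyzer = new StandardAnalyzer();
    try (TokenStream stream = analyzer.tokenStream("field", "the quick brown fox")) {
      stream.reset(); // required before the first incrementToken()
      displayTokensWithFullDetails(stream); // the helper above, assumed copied into this class
      stream.end();
    }
  }
}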
Use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project Solbase by Photobucket.
From the class IndexWriter, method parseDoc:
@SuppressWarnings("unchecked")
public ParsedDoc parseDoc(Document doc, Analyzer analyzer, String indexName, int docNumber, List<String> sortFieldNames) throws CorruptIndexException, IOException {
  // given a doc, what are all of the terms we indexed
  List<Term> allIndexedTerms = new ArrayList<Term>();
  Map<String, byte[]> fieldCache = new HashMap<String, byte[]>(1024);
  // need to hold onto TermDocMetadata so we can return this array
  List<TermDocMetadata> metadatas = new ArrayList<TermDocMetadata>();
  byte[] docId = Bytes.toBytes(docNumber);
  int position = 0;
  for (Fieldable field : (List<Fieldable>) doc.getFields()) {
    // indexed field
    if (field.isIndexed() && field.isTokenized()) {
      TokenStream tokens = field.tokenStreamValue();
      if (tokens == null) {
        tokens = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
      }
      // collect term information per field
      Map<Term, Map<ByteBuffer, List<Number>>> allTermInformation = new ConcurrentSkipListMap<Term, Map<ByteBuffer, List<Number>>>();
      int lastOffset = 0;
      if (position > 0) {
        position += analyzer.getPositionIncrementGap(field.name());
      }
      // reset the TokenStream to the first token
      tokens.reset();
      // offsets
      OffsetAttribute offsetAttribute = null;
      if (field.isStoreOffsetWithTermVector()) {
        offsetAttribute = (OffsetAttribute) tokens.addAttribute(OffsetAttribute.class);
      }
      // positions
      PositionIncrementAttribute posIncrAttribute = null;
      if (field.isStorePositionWithTermVector()) {
        posIncrAttribute = (PositionIncrementAttribute) tokens.addAttribute(PositionIncrementAttribute.class);
      }
      TermAttribute termAttribute = (TermAttribute) tokens.addAttribute(TermAttribute.class);
      // store norms of the field per term per document rather than per field;
      // this adds more to write but less to read on the other side
      Integer tokensInField = new Integer(0);
      while (tokens.incrementToken()) {
        tokensInField++;
        Term term = new Term(field.name(), termAttribute.term());
        allIndexedTerms.add(term);
        // fetch all collected information for this term
        Map<ByteBuffer, List<Number>> termInfo = allTermInformation.get(term);
        if (termInfo == null) {
          termInfo = new ConcurrentSkipListMap<ByteBuffer, List<Number>>();
          allTermInformation.put(term, termInfo);
        }
        // term frequency
        List<Number> termFrequency = termInfo.get(TermDocMetadata.termFrequencyKeyBytes);
        if (termFrequency == null) {
          termFrequency = new ArrayList<Number>();
          termFrequency.add(new Integer(0));
          termInfo.put(TermDocMetadata.termFrequencyKeyBytes, termFrequency);
        }
        // increment
        termFrequency.set(0, termFrequency.get(0).intValue() + 1);
        // position vector
        if (field.isStorePositionWithTermVector()) {
          position += (posIncrAttribute.getPositionIncrement() - 1);
          List<Number> positionVector = termInfo.get(TermDocMetadata.positionVectorKeyBytes);
          if (positionVector == null) {
            positionVector = new ArrayList<Number>();
            termInfo.put(TermDocMetadata.positionVectorKeyBytes, positionVector);
          }
          positionVector.add(++position);
        }
        // term offsets
        if (field.isStoreOffsetWithTermVector()) {
          List<Number> offsetVector = termInfo.get(TermDocMetadata.offsetVectorKeyBytes);
          if (offsetVector == null) {
            offsetVector = new ArrayList<Number>();
            termInfo.put(TermDocMetadata.offsetVectorKeyBytes, offsetVector);
          }
          offsetVector.add(lastOffset + offsetAttribute.startOffset());
          offsetVector.add(lastOffset + offsetAttribute.endOffset());
        }
        List<Number> sortValues = new ArrayList<Number>();
        // init sortValues
        for (int i = 0; i < Scorer.numSort; i++) {
          sortValues.add(new Integer(-1));
        }
        int order = 0;
        // extract the sort field value and store it in the term doc metadata object
        for (String fieldName : sortFieldNames) {
          Fieldable fieldable = doc.getFieldable(fieldName);
          if (fieldable instanceof EmbeddedSortField) {
            EmbeddedSortField sortField = (EmbeddedSortField) fieldable;
            int value = -1;
            if (sortField.stringValue() != null) {
              value = Integer.parseInt(sortField.stringValue());
            }
            int sortSlot = sortField.getSortSlot();
            sortValues.set(sortSlot - 1, new Integer(value));
          } else {
            // TODO: this logic is used for real-time indexing.
            // hacky: depends on the order of sort field names in the array
            int value = -1;
            if (fieldable.stringValue() != null) {
              value = Integer.parseInt(fieldable.stringValue());
            }
            sortValues.set(order++, new Integer(value));
          }
        }
        termInfo.put(TermDocMetadata.sortFieldKeyBytes, sortValues);
      }
      List<Number> bnorm = null;
      if (!field.getOmitNorms()) {
        bnorm = new ArrayList<Number>();
        float norm = doc.getBoost();
        norm *= field.getBoost();
        norm *= similarity.lengthNorm(field.name(), tokensInField);
        bnorm.add(Similarity.encodeNorm(norm));
      }
      for (Map.Entry<Term, Map<ByteBuffer, List<Number>>> term : allTermInformation.entrySet()) {
        Term tempTerm = term.getKey();
        byte[] fieldTermKeyBytes = SolbaseUtil.generateTermKey(tempTerm);
        // more writes but faster on the read side
        if (!field.getOmitNorms()) {
          term.getValue().put(TermDocMetadata.normsKeyBytes, bnorm);
        }
        TermDocMetadata data = new TermDocMetadata(docNumber, term.getValue(), fieldTermKeyBytes, tempTerm);
        metadatas.add(data);
      }
    }
    // untokenized fields go in without a term position
    if (field.isIndexed() && !field.isTokenized()) {
      Term term = new Term(field.name(), field.stringValue());
      allIndexedTerms.add(term);
      byte[] fieldTermKeyBytes = SolbaseUtil.generateTermKey(term);
      Map<ByteBuffer, List<Number>> termMap = new ConcurrentSkipListMap<ByteBuffer, List<Number>>();
      termMap.put(TermDocMetadata.termFrequencyKeyBytes, Arrays.asList(new Number[] {}));
      termMap.put(TermDocMetadata.positionVectorKeyBytes, Arrays.asList(new Number[] {}));
      TermDocMetadata data = new TermDocMetadata(docNumber, termMap, fieldTermKeyBytes, term);
      metadatas.add(data);
    }
    // stores each field as a column under this doc key
    if (field.isStored()) {
      byte[] _value = field.isBinary() ? field.getBinaryValue() : Bytes.toBytes(field.stringValue());
      // the last byte flags whether the value is binary or not
      byte[] value = new byte[_value.length + 1];
      System.arraycopy(_value, 0, value, 0, _value.length);
      value[value.length - 1] = (byte) (field.isBinary() ? Byte.MAX_VALUE : Byte.MIN_VALUE);
      // logic to handle multiple fields w/ the same name
      byte[] currentValue = fieldCache.get(field.name());
      if (currentValue == null) {
        fieldCache.put(field.name(), value);
      } else {
        // append new data
        byte[] newValue = new byte[currentValue.length + SolbaseUtil.delimiter.length + value.length - 1];
        System.arraycopy(currentValue, 0, newValue, 0, currentValue.length - 1);
        System.arraycopy(SolbaseUtil.delimiter, 0, newValue, currentValue.length - 1, SolbaseUtil.delimiter.length);
        System.arraycopy(value, 0, newValue, currentValue.length + SolbaseUtil.delimiter.length - 1, value.length);
        fieldCache.put(field.name(), newValue);
      }
    }
  }
  Put documentPut = new Put(SolbaseUtil.randomize(docNumber));
  // store each field as a column under this docId
  for (Map.Entry<String, byte[]> field : fieldCache.entrySet()) {
    documentPut.add(Bytes.toBytes("field"), Bytes.toBytes(field.getKey()), field.getValue());
  }
  // in case of a real-time update, we need to add back the docId field
  if (!documentPut.has(Bytes.toBytes("field"), Bytes.toBytes("docId"))) {
    byte[] docIdStr = Bytes.toBytes(new Integer(docNumber).toString());
    // the last byte flags whether the value is binary or not
    byte[] value = new byte[docIdStr.length + 1];
    System.arraycopy(docIdStr, 0, value, 0, docIdStr.length);
    value[value.length - 1] = (byte) (Byte.MIN_VALUE);
    documentPut.add(Bytes.toBytes("field"), Bytes.toBytes("docId"), value);
  }
  // finally, store metadata so we can delete this document
  documentPut.add(Bytes.toBytes("allTerms"), Bytes.toBytes("allTerms"), SolbaseUtil.toBytes(allIndexedTerms).array());
  ParsedDoc parsedDoc = new ParsedDoc(metadatas, doc, documentPut, fieldCache.entrySet(), allIndexedTerms);
  return parsedDoc;
}
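The position bookkeeping above is easy to misread: adding getPositionIncrement() - 1 absorbs the gap left by tokens the analyzer removed (e.g. stopwords), and the pre-increment then records a 1-based absolute position. A self-contained sketch of just that arithmetic, with a hypothetical increment sequence:

import java.util.ArrayList;
import java.util.List;

public class PositionSketch {
  public static void main(String[] args) {
    int position = 0;
    int[] increments = { 1, 1, 2 }; // the third token follows one removed token
    List<Integer> positionVector = new ArrayList<>();
    for (int inc : increments) {
      position += (inc - 1);          // absorb the gap
      positionVector.add(++position); // store the absolute position
    }
    System.out.println(positionVector); // prints [1, 2, 4]
  }
}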
Use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project zm-mailbox by Zimbra.
From the class UniversalAnalyzerTest, method testSTD:
private void testSTD(String src) throws IOException {
  TokenStream std = standardAnalyzer.tokenStream(null, new StringReader(src));
  CharTermAttribute stdTermAttr = std.addAttribute(CharTermAttribute.class);
  OffsetAttribute stdOffsetAttr = std.addAttribute(OffsetAttribute.class);
  PositionIncrementAttribute stdPosIncAttr = std.addAttribute(PositionIncrementAttribute.class);
  TokenStream uni = universalAnalyzer.tokenStream(null, new StringReader(src));
  CharTermAttribute uniTermAttr = uni.addAttribute(CharTermAttribute.class);
  OffsetAttribute uniOffsetAttr = uni.addAttribute(OffsetAttribute.class);
  PositionIncrementAttribute uniPosIncAttr = uni.addAttribute(PositionIncrementAttribute.class);
  while (true) {
    boolean result = std.incrementToken();
    Assert.assertEquals(result, uni.incrementToken());
    if (!result) {
      break;
    }
    String term = stdTermAttr.toString();
    Assert.assertEquals(stdTermAttr, uniTermAttr);
    if (assertOffset) {
      Assert.assertEquals(term, stdOffsetAttr, uniOffsetAttr);
    }
    Assert.assertEquals(term, stdPosIncAttr, uniPosIncAttr);
  }
}
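A hypothetical JUnit caller, for illustration only (the method name and sample input are invented; assertOffset is a field of the enclosing test class):

@Test
public void compareWithStandardAnalyzer() throws IOException {
  testSTD("The quick brown fox jumps over the lazy dog");
}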
Use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project lucene-solr by apache.
From the class SpellCheckComponent, method getTokens:
private Collection<Token> getTokens(String q, Analyzer analyzer) throws IOException {
  Collection<Token> result = new ArrayList<>();
  assert analyzer != null;
  try (TokenStream ts = analyzer.tokenStream("", q)) {
    ts.reset();
    // TODO: support custom attributes
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
    FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class);
    PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
    while (ts.incrementToken()) {
      Token token = new Token();
      token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
      token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
      token.setType(typeAtt.type());
      token.setFlags(flagsAtt.getFlags());
      token.setPayload(payloadAtt.getPayload());
      token.setPositionIncrement(posIncAtt.getPositionIncrement());
      result.add(token);
    }
    ts.end();
    return result;
  }
}
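A usage sketch, assuming a hypothetical caller with access to the method (in SpellCheckComponent it is private) and a simple WhitespaceAnalyzer:

Analyzer analyzer = new WhitespaceAnalyzer();
Collection<Token> tokens = getTokens("spel chek", analyzer);
for (Token t : tokens) {
  // Token exposes the attribute values copied in the loop above
  System.out.println(t + " " + t.startOffset() + "-" + t.endOffset() + " posInc=" + t.getPositionIncrement());
}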
Use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project lucene-solr by apache.
From the class SimplePreAnalyzedParser, method toFormattedString:
@Override
public String toFormattedString(Field f) throws IOException {
  StringBuilder sb = new StringBuilder();
  sb.append(VERSION + " ");
  if (f.fieldType().stored()) {
    String s = f.stringValue();
    if (s != null) {
      // encode the equals sign (the replacement needs a double backslash to emit a literal "\=")
      s = s.replaceAll("=", "\\\\=");
      sb.append('=');
      sb.append(s);
      sb.append('=');
    }
  }
  TokenStream ts = f.tokenStreamValue();
  if (ts != null) {
    StringBuilder tok = new StringBuilder();
    boolean next = false;
    while (ts.incrementToken()) {
      if (next) {
        sb.append(' ');
      } else {
        next = true;
      }
      tok.setLength(0);
      Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator();
      String cTerm = null;
      String tTerm = null;
      while (it.hasNext()) {
        Class<? extends Attribute> cl = it.next();
        Attribute att = ts.getAttribute(cl);
        if (att == null) {
          continue;
        }
        if (cl.isAssignableFrom(CharTermAttribute.class)) {
          CharTermAttribute catt = (CharTermAttribute) att;
          cTerm = escape(catt.buffer(), catt.length());
        } else if (cl.isAssignableFrom(TermToBytesRefAttribute.class)) {
          TermToBytesRefAttribute tatt = (TermToBytesRefAttribute) att;
          char[] tTermChars = tatt.getBytesRef().utf8ToString().toCharArray();
          tTerm = escape(tTermChars, tTermChars.length);
        } else {
          if (tok.length() > 0) {
            tok.append(',');
          }
          if (cl.isAssignableFrom(FlagsAttribute.class)) {
            tok.append("f=" + Integer.toHexString(((FlagsAttribute) att).getFlags()));
          } else if (cl.isAssignableFrom(OffsetAttribute.class)) {
            tok.append("s=" + ((OffsetAttribute) att).startOffset() + ",e=" + ((OffsetAttribute) att).endOffset());
          } else if (cl.isAssignableFrom(PayloadAttribute.class)) {
            BytesRef p = ((PayloadAttribute) att).getPayload();
            if (p != null && p.length > 0) {
              tok.append("p=" + bytesToHex(p.bytes, p.offset, p.length));
            } else if (tok.length() > 0) {
              // remove the last comma
              tok.setLength(tok.length() - 1);
            }
          } else if (cl.isAssignableFrom(PositionIncrementAttribute.class)) {
            tok.append("i=" + ((PositionIncrementAttribute) att).getPositionIncrement());
          } else if (cl.isAssignableFrom(TypeAttribute.class)) {
            tok.append("y=" + escape(((TypeAttribute) att).type()));
          } else {
            tok.append(cl.getName() + "=" + escape(att.toString()));
          }
        }
      }
      String term = null;
      if (cTerm != null) {
        term = cTerm;
      } else {
        term = tTerm;
      }
      if (term != null && term.length() > 0) {
        if (tok.length() > 0) {
          tok.insert(0, term + ",");
        } else {
          tok.insert(0, term);
        }
      }
      sb.append(tok);
    }
  }
  return sb.toString();
}
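For orientation, the string this method builds looks roughly like the following (illustrative values: "1" is the parser VERSION, the optional stored value sits between equals signs with inner ones escaped, and each token is a term followed by comma-separated attribute fields such as s/e for offsets, i for position increment, and y for type):

1 =stored \= value= quick,s=0,e=5,i=1,y=word fox,s=6,e=9,i=2,y=word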