use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project elasticsearch by elastic.
the class MapperQueryParser method getPossiblyAnalyzedPrefixQuery.
/**
 * Builds a prefix query for {@code termStr}, analyzing the term first when the
 * settings ask for wildcard analysis.
 * <p>
 * The analyzed tokens are grouped by position (tokens with a position increment
 * of zero share the position of the previous token). Every position but the last
 * becomes an exact match (a term query, or a synonym query when several tokens
 * share the position); only the last position is turned into prefix queries.
 *
 * @param field   field to query
 * @param termStr raw prefix text
 * @return the resulting query, or {@code null} when analysis yields no token
 * @throws ParseException if one of the delegated query builders throws it
 */
private Query getPossiblyAnalyzedPrefixQuery(String field, String termStr) throws ParseException {
    if (settings.analyzeWildcard() == false) {
        return super.getPrefixQuery(field, termStr);
    }
    // One inner list per token position, in stream order.
    final List<List<String>> positions = new ArrayList<>();
    TokenStream stream = null;
    try {
        try {
            stream = getAnalyzer().tokenStream(field, termStr);
            stream.reset();
        } catch (IOException e) {
            // The analyzer could not be set up: fall back to the unanalyzed prefix query.
            return super.getPrefixQuery(field, termStr);
        }
        final CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
        final PositionIncrementAttribute incrementAttribute = stream.addAttribute(PositionIncrementAttribute.class);
        List<String> tokensAtPosition = new ArrayList<>();
        for (;;) {
            boolean hasToken;
            try {
                hasToken = stream.incrementToken();
            } catch (IOException e) {
                // A failure while reading simply ends tokenization; tokens read so far are kept.
                hasToken = false;
            }
            if (hasToken == false) {
                break;
            }
            // A positive increment starts a new position, unless nothing has been collected yet.
            if (!tokensAtPosition.isEmpty() && incrementAttribute.getPositionIncrement() > 0) {
                positions.add(tokensAtPosition);
                tokensAtPosition = new ArrayList<>();
            }
            tokensAtPosition.add(termAttribute.toString());
        }
        if (!tokensAtPosition.isEmpty()) {
            positions.add(tokensAtPosition);
        }
    } finally {
        if (stream != null) {
            IOUtils.closeWhileHandlingException(stream);
        }
    }

    if (positions.isEmpty()) {
        return null;
    }
    if (positions.size() == 1 && positions.get(0).size() == 1) {
        // Single token: a plain prefix query on the analyzed form is enough.
        return super.getPrefixQuery(field, positions.get(0).get(0));
    }

    // Combine one clause per position; the prefix applies to the last position only.
    final List<BooleanClause> positionClauses = new ArrayList<>();
    final int lastIndex = positions.size() - 1;
    for (int index = 0; index <= lastIndex; index++) {
        final List<String> tokens = positions.get(index);
        final boolean single = tokens.size() == 1;
        Query positionQuery;
        if (index == lastIndex) {
            if (single) {
                positionQuery = super.getPrefixQuery(field, tokens.get(0));
            } else {
                // Several tokens at the last position: any of their prefixes may match.
                List<BooleanClause> prefixClauses = new ArrayList<>();
                for (String token : tokens) {
                    Query prefix = super.getPrefixQuery(field, token);
                    prefixClauses.add(new BooleanClause(prefix, BooleanClause.Occur.SHOULD));
                }
                positionQuery = getBooleanQueryCoordDisabled(prefixClauses);
            }
        } else {
            if (single) {
                positionQuery = newTermQuery(new Term(field, tokens.get(0)));
            } else {
                // Several tokens at the same position are treated as synonyms.
                Term[] synonyms = new Term[tokens.size()];
                int slot = 0;
                for (String token : tokens) {
                    synonyms[slot++] = new Term(field, token);
                }
                positionQuery = new SynonymQuery(synonyms);
            }
        }
        BooleanClause.Occur occur;
        if (getDefaultOperator() == Operator.AND) {
            occur = BooleanClause.Occur.MUST;
        } else {
            occur = BooleanClause.Occur.SHOULD;
        }
        positionClauses.add(new BooleanClause(positionQuery, occur));
    }
    return getBooleanQuery(positionClauses);
}
use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project elasticsearch by elastic.
the class TokenCountFieldMapper method countPositions.
/**
 * Sums the position increments produced when {@code fieldValue} is analyzed,
 * including the final increment reported after the stream has ended.
 * Package private so tests can call it.
 *
 * @param analyzer   analyzer that creates the token stream
 * @param fieldName  field name handed to the analyzer
 * @param fieldValue text handed to the analyzer
 * @return total of all position increments of the token stream
 * @throws IOException if the token stream throws it
 */
static int countPositions(Analyzer analyzer, String fieldName, String fieldValue) throws IOException {
    try (TokenStream stream = analyzer.tokenStream(fieldName, fieldValue)) {
        final PositionIncrementAttribute increment = stream.addAttribute(PositionIncrementAttribute.class);
        stream.reset();
        int total = 0;
        for (boolean more = stream.incrementToken(); more; more = stream.incrementToken()) {
            total += increment.getPositionIncrement();
        }
        // After end() the attribute holds the increment left over behind the last token.
        stream.end();
        return total + increment.getPositionIncrement();
    }
}
use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project elasticsearch by elastic.
the class TransportAnalyzeAction method simpleAnalyze.
/**
 * Analyzes every text of the request with {@code analyzer} and returns the
 * produced tokens as one list.
 * <p>
 * Positions and offsets are continued from one text to the next: after each text
 * the running position is advanced by the stream's final increment plus the
 * analyzer's position increment gap, and the running offset by the stream's final
 * end offset plus the analyzer's offset gap.
 *
 * @param request  request supplying the texts to analyze
 * @param analyzer analyzer to run
 * @param field    field name handed to the analyzer
 * @return the tokens of all texts, in order
 * @throws ElasticsearchException wrapping any {@link IOException} raised during analysis
 */
private static List<AnalyzeResponse.AnalyzeToken> simpleAnalyze(AnalyzeRequest request, Analyzer analyzer, String field) {
    final List<AnalyzeResponse.AnalyzeToken> result = new ArrayList<>();
    int position = -1;
    int offsetBase = 0;
    for (String text : request.text()) {
        try (TokenStream ts = analyzer.tokenStream(field, text)) {
            ts.reset();
            final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            final PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
            final OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
            final TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
            final PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class);

            while (ts.incrementToken()) {
                // Only positive increments move the position; zero keeps the token stacked.
                position += Math.max(posIncrAtt.getPositionIncrement(), 0);
                final String termText = termAtt.toString();
                final int start = offsetBase + offsetAtt.startOffset();
                final int end = offsetBase + offsetAtt.endOffset();
                result.add(new AnalyzeResponse.AnalyzeToken(
                        termText, position, start, end, posLenAtt.getPositionLength(), typeAtt.type(), null));
            }

            // Carry the end-of-stream state and the analyzer gaps over to the next text.
            ts.end();
            offsetBase = offsetBase + offsetAtt.endOffset();
            position = position + posIncrAtt.getPositionIncrement();
            position = position + analyzer.getPositionIncrementGap(field);
            offsetBase = offsetBase + analyzer.getOffsetGap(field);
        } catch (IOException e) {
            throw new ElasticsearchException("failed to analyze", e);
        }
    }
    return result;
}
use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project Solbase by Photobucket.
the class IndexWriter method parseDoc.
/**
 * Converts a document into a {@link ParsedDoc}: one {@link TermDocMetadata} per
 * indexed term, a {@link Put} holding the stored field values, and the list of
 * all indexed terms.
 * <p>
 * Per field, three independent checks are made:
 * <ul>
 * <li>indexed and tokenized: the field's token stream is consumed and, per term,
 * the frequency, optional position and offset vectors, sort values and (unless
 * norms are omitted) the encoded norm are collected;</li>
 * <li>indexed but not tokenized: a single term is recorded with empty frequency
 * and position lists;</li>
 * <li>stored: the value is serialized with a trailing flag byte and collected per
 * field name, values of same-named fields being joined with
 * {@code SolbaseUtil.delimiter}.</li>
 * </ul>
 *
 * @param doc document to parse
 * @param analyzer analyzer used for fields that carry no pre-built token stream
 * @param indexName not referenced in this method body
 * @param docNumber document number; keys the {@link Put} (via {@code SolbaseUtil.randomize}) and is passed to every {@link TermDocMetadata}
 * @param sortFieldNames names of document fields whose integer values are stored as sort values with each term
 * @return the parsed representation of {@code doc}
 * @throws CorruptIndexException declared by the signature; not thrown directly in this body
 * @throws IOException if consuming a token stream throws it
 */
@SuppressWarnings("unchecked")
public ParsedDoc parseDoc(Document doc, Analyzer analyzer, String indexName, int docNumber, List<String> sortFieldNames) throws CorruptIndexException, IOException {
// given doc, what are all of terms we indexed
List<Term> allIndexedTerms = new ArrayList<Term>();
// stored field values keyed by field name
Map<String, byte[]> fieldCache = new HashMap<String, byte[]>(1024);
// need to hold onto TermDocMetaData, so it can return this array
List<TermDocMetadata> metadatas = new ArrayList<TermDocMetadata>();
// NOTE(review): docId is not read anywhere else in this method
byte[] docId = Bytes.toBytes(docNumber);
// running token position; declared outside the loop so it carries over from field to field
int position = 0;
for (Fieldable field : (List<Fieldable>) doc.getFields()) {
// Indexed field
if (field.isIndexed() && field.isTokenized()) {
// prefer a token stream supplied with the field; otherwise analyze its string value
TokenStream tokens = field.tokenStreamValue();
if (tokens == null) {
tokens = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
}
// collect term information per field
Map<Term, Map<ByteBuffer, List<Number>>> allTermInformation = new ConcurrentSkipListMap<Term, Map<ByteBuffer, List<Number>>>();
// NOTE(review): lastOffset is never modified below, so the offsets stored are the raw attribute offsets
int lastOffset = 0;
// once positions have been assigned, add the analyzer's gap before the next field's tokens
if (position > 0) {
position += analyzer.getPositionIncrementGap(field.name());
}
// reset the TokenStream to the first token
tokens.reset();
// NOTE(review): the stream is never ended or closed in this method - confirm whether the caller owns that
// offsets
OffsetAttribute offsetAttribute = null;
if (field.isStoreOffsetWithTermVector())
offsetAttribute = (OffsetAttribute) tokens.addAttribute(OffsetAttribute.class);
// positions
PositionIncrementAttribute posIncrAttribute = null;
if (field.isStorePositionWithTermVector())
posIncrAttribute = (PositionIncrementAttribute) tokens.addAttribute(PositionIncrementAttribute.class);
TermAttribute termAttribute = (TermAttribute) tokens.addAttribute(TermAttribute.class);
// store normalizations of field per term per document
// rather
// than per field.
// this adds more to write but less to read on other side
// number of tokens seen in this field; feeds the length norm below
Integer tokensInField = new Integer(0);
while (tokens.incrementToken()) {
tokensInField++;
Term term = new Term(field.name(), termAttribute.term());
// one entry per token, so a repeated term appears repeatedly in this list
allIndexedTerms.add(term);
// fetch all collected information for this term
Map<ByteBuffer, List<Number>> termInfo = allTermInformation.get(term);
if (termInfo == null) {
termInfo = new ConcurrentSkipListMap<ByteBuffer, List<Number>>();
allTermInformation.put(term, termInfo);
}
// term frequency
List<Number> termFrequency = termInfo.get(TermDocMetadata.termFrequencyKeyBytes);
if (termFrequency == null) {
// single-element list used as a mutable counter
termFrequency = new ArrayList<Number>();
termFrequency.add(new Integer(0));
termInfo.put(TermDocMetadata.termFrequencyKeyBytes, termFrequency);
}
// increment
termFrequency.set(0, termFrequency.get(0).intValue() + 1);
// position vector
if (field.isStorePositionWithTermVector()) {
// together with the ++position below, position advances by exactly the token's increment
position += (posIncrAttribute.getPositionIncrement() - 1);
List<Number> positionVector = termInfo.get(TermDocMetadata.positionVectorKeyBytes);
if (positionVector == null) {
positionVector = new ArrayList<Number>();
termInfo.put(TermDocMetadata.positionVectorKeyBytes, positionVector);
}
positionVector.add(++position);
}
// term offsets
if (field.isStoreOffsetWithTermVector()) {
List<Number> offsetVector = termInfo.get(TermDocMetadata.offsetVectorKeyBytes);
if (offsetVector == null) {
offsetVector = new ArrayList<Number>();
termInfo.put(TermDocMetadata.offsetVectorKeyBytes, offsetVector);
}
// start and end are appended as consecutive entries
offsetVector.add(lastOffset + offsetAttribute.startOffset());
offsetVector.add(lastOffset + offsetAttribute.endOffset());
}
// sort values are rebuilt for every token and overwrite the term's previous entry
List<Number> sortValues = new ArrayList<Number>();
// init sortValues
for (int i = 0; i < Scorer.numSort; i++) {
sortValues.add(new Integer(-1));
}
int order = 0;
// extract sort field value and store it in term doc metadata obj
for (String fieldName : sortFieldNames) {
Fieldable fieldable = doc.getFieldable(fieldName);
if (fieldable instanceof EmbeddedSortField) {
EmbeddedSortField sortField = (EmbeddedSortField) fieldable;
int value = -1;
if (sortField.stringValue() != null) {
value = Integer.parseInt(sortField.stringValue());
}
// slot numbers are 1-based here, hence the - 1 when indexing the list
int sortSlot = sortField.getSortSlot();
sortValues.set(sortSlot - 1, new Integer(value));
} else {
// TODO: this logic is used for real time indexing.
// hacky. depending on order of sort field names in array
// NOTE(review): if the document has no field with this name, fieldable is presumably null and the next check would throw - confirm
int value = -1;
if (fieldable.stringValue() != null) {
value = Integer.parseInt(fieldable.stringValue());
}
sortValues.set(order++, new Integer(value));
}
}
termInfo.put(TermDocMetadata.sortFieldKeyBytes, sortValues);
}
// norm = document boost * field boost * length norm, encoded once per field
List<Number> bnorm = null;
if (!field.getOmitNorms()) {
bnorm = new ArrayList<Number>();
float norm = doc.getBoost();
norm *= field.getBoost();
norm *= similarity.lengthNorm(field.name(), tokensInField);
bnorm.add(Similarity.encodeNorm(norm));
}
// turn the collected per-term information into TermDocMetadata objects
for (Map.Entry<Term, Map<ByteBuffer, List<Number>>> term : allTermInformation.entrySet()) {
Term tempTerm = term.getKey();
byte[] fieldTermKeyBytes = SolbaseUtil.generateTermKey(tempTerm);
// more writes but faster on read side.
if (!field.getOmitNorms()) {
term.getValue().put(TermDocMetadata.normsKeyBytes, bnorm);
}
TermDocMetadata data = new TermDocMetadata(docNumber, term.getValue(), fieldTermKeyBytes, tempTerm);
metadatas.add(data);
}
}
// Untokenized fields go in without a termPosition
if (field.isIndexed() && !field.isTokenized()) {
// the whole string value is the single term
Term term = new Term(field.name(), field.stringValue());
allIndexedTerms.add(term);
byte[] fieldTermKeyBytes = SolbaseUtil.generateTermKey(term);
Map<ByteBuffer, List<Number>> termMap = new ConcurrentSkipListMap<ByteBuffer, List<Number>>();
// frequency and position lists are stored empty for these terms
termMap.put(TermDocMetadata.termFrequencyKeyBytes, Arrays.asList(new Number[] {}));
termMap.put(TermDocMetadata.positionVectorKeyBytes, Arrays.asList(new Number[] {}));
TermDocMetadata data = new TermDocMetadata(docNumber, termMap, fieldTermKeyBytes, term);
metadatas.add(data);
}
// Stores each field as a column under this doc key
if (field.isStored()) {
byte[] _value = field.isBinary() ? field.getBinaryValue() : Bytes.toBytes(field.stringValue());
// last byte flags if binary or not (Byte.MAX_VALUE = binary, Byte.MIN_VALUE = not binary)
byte[] value = new byte[_value.length + 1];
System.arraycopy(_value, 0, value, 0, _value.length);
value[value.length - 1] = (byte) (field.isBinary() ? Byte.MAX_VALUE : Byte.MIN_VALUE);
// logic to handle multiple fields w/ same name
byte[] currentValue = fieldCache.get(field.name());
if (currentValue == null) {
fieldCache.put(field.name(), value);
} else {
// append new data
// layout: existing value without its trailing flag byte, then the delimiter, then the new value including its flag byte
byte[] newValue = new byte[currentValue.length + SolbaseUtil.delimiter.length + value.length - 1];
System.arraycopy(currentValue, 0, newValue, 0, currentValue.length - 1);
System.arraycopy(SolbaseUtil.delimiter, 0, newValue, currentValue.length - 1, SolbaseUtil.delimiter.length);
System.arraycopy(value, 0, newValue, currentValue.length + SolbaseUtil.delimiter.length - 1, value.length);
fieldCache.put(field.name(), newValue);
}
}
}
Put documentPut = new Put(SolbaseUtil.randomize(docNumber));
// Store each field as a column under this docId
for (Map.Entry<String, byte[]> field : fieldCache.entrySet()) {
documentPut.add(Bytes.toBytes("field"), Bytes.toBytes(field.getKey()), field.getValue());
}
// in case of real time update, we need to add back docId field
if (!documentPut.has(Bytes.toBytes("field"), Bytes.toBytes("docId"))) {
byte[] docIdStr = Bytes.toBytes(new Integer(docNumber).toString());
// last byte flags if binary or not; the doc id is written as a non-binary string
byte[] value = new byte[docIdStr.length + 1];
System.arraycopy(docIdStr, 0, value, 0, docIdStr.length);
value[value.length - 1] = (byte) (Byte.MIN_VALUE);
documentPut.add(Bytes.toBytes("field"), Bytes.toBytes("docId"), value);
}
// Finally, Store meta-data so we can delete this document
documentPut.add(Bytes.toBytes("allTerms"), Bytes.toBytes("allTerms"), SolbaseUtil.toBytes(allIndexedTerms).array());
ParsedDoc parsedDoc = new ParsedDoc(metadatas, doc, documentPut, fieldCache.entrySet(), allIndexedTerms);
return parsedDoc;
}
use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project languagetool by languagetool-org.
the class LanguageToolFilterTest method displayTokensWithFullDetails.
/**
 * Prints every token of {@code stream} to standard output, grouped by position.
 * <p>
 * Each token with a positive position increment starts a new output line that
 * begins with the position number; tokens with a zero increment are appended to
 * the current line. A token is rendered as {@code [term:start->end:type]}.
 * The stream is consumed as handed in; this method does not reset, end or close it.
 *
 * @param stream token stream to dump
 * @throws IOException if advancing the stream throws it
 */
private static void displayTokensWithFullDetails(TokenStream stream) throws IOException {
    final CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class);
    final OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
    final TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);

    int currentPosition = 0;
    while (stream.incrementToken()) {
        final int step = posIncrAtt.getPositionIncrement();
        if (step > 0) {
            // New position: break the line and print the position label.
            currentPosition += step;
            System.out.println();
            System.out.print(currentPosition + ": ");
        }
        final StringBuilder rendered = new StringBuilder();
        rendered.append("[").append(termAtt.toString());
        rendered.append(":").append(offsetAtt.startOffset());
        rendered.append("->").append(offsetAtt.endOffset());
        rendered.append(":").append(typeAtt.type());
        rendered.append("] ");
        System.out.print(rendered.toString());
    }
    System.out.println();
}
Aggregations