Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project jackrabbit by apache.
The class AbstractExcerpt, method createTermPositionVector.
/**
 * @param text the text.
 * @return a <code>TermPositionVector</code> for the given text.
 */
private TermPositionVector createTermPositionVector(String text) {
    // term -> TermVectorOffsetInfo[]
    final SortedMap<String, TermVectorOffsetInfo[]> termMap =
            new TreeMap<String, TermVectorOffsetInfo[]>();
    Reader r = new StringReader(text);
    TokenStream ts = index.getTextAnalyzer().tokenStream("", r);
    try {
        while (ts.incrementToken()) {
            OffsetAttribute offset = ts.getAttribute(OffsetAttribute.class);
            TermAttribute term = ts.getAttribute(TermAttribute.class);
            String termText = term.term();
            // grow the offset array by one entry per occurrence of the term
            TermVectorOffsetInfo[] info = termMap.get(termText);
            if (info == null) {
                info = new TermVectorOffsetInfo[1];
            } else {
                TermVectorOffsetInfo[] tmp = info;
                info = new TermVectorOffsetInfo[tmp.length + 1];
                System.arraycopy(tmp, 0, info, 0, tmp.length);
            }
            info[info.length - 1] = new TermVectorOffsetInfo(
                    offset.startOffset(), offset.endOffset());
            termMap.put(termText, info);
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        // should never happen, we are reading from a string
    }
    return new TermPositionVector() {

        private String[] terms =
                termMap.keySet().toArray(new String[termMap.size()]);

        public int[] getTermPositions(int index) {
            // positions are not tracked, only offsets
            return null;
        }

        public TermVectorOffsetInfo[] getOffsets(int index) {
            TermVectorOffsetInfo[] info = TermVectorOffsetInfo.EMPTY_OFFSET_INFO;
            if (index >= 0 && index < terms.length) {
                info = termMap.get(terms[index]);
            }
            return info;
        }

        public String getField() {
            return "";
        }

        public int size() {
            return terms.length;
        }

        public String[] getTerms() {
            return terms;
        }

        public int[] getTermFrequencies() {
            // the frequency of a term equals the number of recorded offsets
            int[] freqs = new int[terms.length];
            for (int i = 0; i < terms.length; i++) {
                freqs[i] = termMap.get(terms[i]).length;
            }
            return freqs;
        }

        public int indexOf(String term) {
            int res = Arrays.binarySearch(terms, term);
            return res >= 0 ? res : -1;
        }

        public int[] indexesOf(String[] terms, int start, int len) {
            int[] res = new int[len];
            for (int i = 0; i < len; i++) {
                // honor the start offset into the given terms array
                res[i] = indexOf(terms[start + i]);
            }
            return res;
        }
    };
}
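For orientation, a minimal hedged sketch of how such a vector might be consumed, e.g. by excerpt-highlighting code inside the same class; the tpv and text names are illustrative assumptions, not Jackrabbit API:

// Sketch (assumed usage): walk each term's recorded offsets and extract
// the matching character ranges from the original text.
TermPositionVector tpv = createTermPositionVector(text);
String[] terms = tpv.getTerms();
for (int i = 0; i < terms.length; i++) {
    for (TermVectorOffsetInfo info : tpv.getOffsets(i)) {
        // e.g. wrap text[startOffset, endOffset) in highlight markup
        System.out.println(terms[i] + " at ["
                + info.getStartOffset() + ", " + info.getEndOffset() + ")");
    }
}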
Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project jackrabbit-oak by apache.
The class LuceneIndex, method tokenize.
/**
 * Tries to merge back tokens that are split on relevant fulltext query
 * wildcards ('*' or '?').
 *
 * @param text the fulltext query text to tokenize
 * @param analyzer the analyzer used to tokenize the text
 * @return the list of tokens with wildcard fragments merged back together,
 *         or <code>null</code> if tokenization fails
 */
static List<String> tokenize(String text, Analyzer analyzer) {
    List<String> tokens = new ArrayList<String>();
    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream(FieldNames.FULLTEXT, new StringReader(text));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
        // TypeAttribute type = stream.addAttribute(TypeAttribute.class);
        stream.reset();
        int poz = 0;
        boolean hasFulltextToken = false;
        StringBuilder token = new StringBuilder();
        while (stream.incrementToken()) {
            String term = termAtt.toString();
            int start = offsetAtt.startOffset();
            int end = offsetAtt.endOffset();
            if (start > poz) {
                // the analyzer dropped characters between the previous token
                // and this one; re-append any wildcard characters it removed
                for (int i = poz; i < start; i++) {
                    for (char c : fulltextTokens) {
                        if (c == text.charAt(i)) {
                            token.append(c);
                            hasFulltextToken = true;
                        }
                    }
                }
            }
            poz = end;
            if (hasFulltextToken) {
                // merge this term into the pending token ending in a wildcard
                token.append(term);
                hasFulltextToken = false;
            } else {
                if (token.length() > 0) {
                    tokens.add(token.toString());
                }
                token = new StringBuilder();
                token.append(term);
            }
        }
        // consume any trailing wildcards up to the end of the string
        if (poz < text.length()) {
            for (int i = poz; i < text.length(); i++) {
                for (char c : fulltextTokens) {
                    if (c == text.charAt(i)) {
                        token.append(c);
                    }
                }
            }
        }
        if (token.length() > 0) {
            tokens.add(token.toString());
        }
        stream.end();
    } catch (IOException e) {
        // pass the exception itself so the stack trace is logged
        LOG.error("Building fulltext query failed", e);
        return null;
    } finally {
        try {
            if (stream != null) {
                stream.close();
            }
        } catch (IOException e) {
            // ignore
        }
    }
    return tokens;
}
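A hedged usage sketch: assuming the analyzer splits on '*' and '?' and drops them (the input string here is an assumption for illustration), tokenize() uses the offset gaps to glue the fragments back together:

// A query such as "jack*bit wor?d" should come back as ["jack*bit", "wor?d"],
// though the exact output depends on the analyzer's behavior.
List<String> tokens = tokenize("jack*bit wor?d", analyzer);
if (tokens != null) {
    for (String t : tokens) {
        System.out.println(t);
    }
}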
Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project lucene-solr by apache.
The class ShingleFilter, method getNextToken.
/**
 * <p>Get the next token from the input stream.
 * <p>If the next token has <code>positionIncrement > 1</code>,
 * <code>positionIncrement - 1</code> {@link #fillerToken}s are
 * inserted first.
 * @param target Where to put the new token; if null, a new instance is created.
 * @return On success, the populated token; null otherwise
 * @throws IOException if the input stream has a problem
 */
private InputWindowToken getNextToken(InputWindowToken target) throws IOException {
    InputWindowToken newTarget = target;
    if (numFillerTokensToInsert > 0) {
        if (null == target) {
            newTarget = new InputWindowToken(nextInputStreamToken.cloneAttributes());
        } else {
            nextInputStreamToken.copyTo(target.attSource);
        }
        // A filler token occupies no space
        newTarget.offsetAtt.setOffset(newTarget.offsetAtt.startOffset(),
                newTarget.offsetAtt.startOffset());
        newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
        newTarget.isFiller = true;
        --numFillerTokensToInsert;
    } else if (isNextInputStreamToken) {
        if (null == target) {
            newTarget = new InputWindowToken(nextInputStreamToken.cloneAttributes());
        } else {
            nextInputStreamToken.copyTo(target.attSource);
        }
        isNextInputStreamToken = false;
        newTarget.isFiller = false;
    } else if (!exhausted) {
        if (input.incrementToken()) {
            if (null == target) {
                newTarget = new InputWindowToken(cloneAttributes());
            } else {
                this.copyTo(target.attSource);
            }
            if (posIncrAtt.getPositionIncrement() > 1) {
                // Each output shingle must contain at least one input token,
                // so no more than (maxShingleSize - 1) filler tokens will be inserted.
                numFillerTokensToInsert =
                        Math.min(posIncrAtt.getPositionIncrement() - 1, maxShingleSize - 1);
                // Save the current token as the next input stream token
                if (null == nextInputStreamToken) {
                    nextInputStreamToken = cloneAttributes();
                } else {
                    this.copyTo(nextInputStreamToken);
                }
                isNextInputStreamToken = true;
                // A filler token occupies no space
                newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
                newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
                newTarget.isFiller = true;
                --numFillerTokensToInsert;
            } else {
                newTarget.isFiller = false;
            }
        } else {
            exhausted = true;
            input.end();
            endState = captureState();
            numFillerTokensToInsert =
                    Math.min(posIncrAtt.getPositionIncrement(), maxShingleSize - 1);
            if (numFillerTokensToInsert > 0) {
                nextInputStreamToken = new AttributeSource(getAttributeFactory());
                nextInputStreamToken.addAttribute(CharTermAttribute.class);
                OffsetAttribute newOffsetAtt =
                        nextInputStreamToken.addAttribute(OffsetAttribute.class);
                newOffsetAtt.setOffset(offsetAtt.endOffset(), offsetAtt.endOffset());
                // Recurse/loop just once:
                return getNextToken(target);
            } else {
                newTarget = null;
            }
        }
    } else {
        newTarget = null;
    }
    return newTarget;
}
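To see the filler logic end to end, a minimal hedged sketch of the surrounding setup, assuming a recent Lucene where WhitespaceTokenizer and StopFilter take no Version argument: a token removed by a stop filter leaves the next token with a position increment of 2, and ShingleFilter plugs the hole with its default "_" filler.

// Sketch (assumed setup): "please divide sense" with "divide" as a stopword.
// The token after the stopword arrives with positionIncrement == 2, so the
// shingle filter emits one filler token, producing bigrams like
// "please _" and "_ sense" alongside the unigrams.
Tokenizer source = new WhitespaceTokenizer();
source.setReader(new StringReader("please divide sense"));
TokenStream stop = new StopFilter(source, StopFilter.makeStopSet("divide"));
TokenStream shingles = new ShingleFilter(stop, 2, 2);
CharTermAttribute term = shingles.addAttribute(CharTermAttribute.class);
shingles.reset();
while (shingles.incrementToken()) {
    System.out.println(term.toString());
}
shingles.end();
shingles.close();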
Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project lucene-solr by apache.
The class EdgeNGramTokenFilterTest, method testSupplementaryCharacters.
public void testSupplementaryCharacters() throws IOException {
    final String s = TestUtil.randomUnicodeString(random(), 10);
    final int codePointCount = s.codePointCount(0, s.length());
    final int minGram = TestUtil.nextInt(random(), 1, 3);
    final int maxGram = TestUtil.nextInt(random(), minGram, 10);
    TokenStream tk = new KeywordTokenizer();
    ((Tokenizer) tk).setReader(new StringReader(s));
    tk = new EdgeNGramTokenFilter(tk, minGram, maxGram);
    final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
    tk.reset();
    for (int i = minGram; i <= Math.min(codePointCount, maxGram); ++i) {
        assertTrue(tk.incrementToken());
        // every gram reports the offsets of the whole original token
        assertEquals(0, offsetAtt.startOffset());
        assertEquals(s.length(), offsetAtt.endOffset());
        // gram boundaries are measured in code points, not chars
        final int end = Character.offsetByCodePoints(s, 0, i);
        assertEquals(s.substring(0, end), termAtt.toString());
    }
    assertFalse(tk.incrementToken());
}
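Why the test measures in code points rather than chars, in a small self-contained illustration: a supplementary character occupies two Java chars but only one code point, so substring boundaries must be derived with Character.offsetByCodePoints, exactly as the loop above does.

// "a" + GRINNING FACE (U+1F600, a surrogate pair): 3 chars, 2 code points.
String s = "a\uD83D\uDE00";
System.out.println(s.length());                          // 3
System.out.println(s.codePointCount(0, s.length()));     // 2
// End index of the first 2 code points, measured in chars:
System.out.println(Character.offsetByCodePoints(s, 0, 2)); // 3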
Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project lucene-solr by apache.
The class NGramTokenFilterTest, method testSupplementaryCharacters.
public void testSupplementaryCharacters() throws IOException {
    final String s = TestUtil.randomUnicodeString(random(), 10);
    final int codePointCount = s.codePointCount(0, s.length());
    final int minGram = TestUtil.nextInt(random(), 1, 3);
    final int maxGram = TestUtil.nextInt(random(), minGram, 10);
    TokenStream tk = new KeywordTokenizer();
    ((Tokenizer) tk).setReader(new StringReader(s));
    tk = new NGramTokenFilter(tk, minGram, maxGram);
    final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
    tk.reset();
    for (int start = 0; start < codePointCount; ++start) {
        for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) {
            assertTrue(tk.incrementToken());
            // every gram reports the offsets of the whole original token
            assertEquals(0, offsetAtt.startOffset());
            assertEquals(s.length(), offsetAtt.endOffset());
            // gram boundaries are measured in code points, not chars
            final int startIndex = Character.offsetByCodePoints(s, 0, start);
            final int endIndex = Character.offsetByCodePoints(s, 0, end);
            assertEquals(s.substring(startIndex, endIndex), termAtt.toString());
        }
    }
    assertFalse(tk.incrementToken());
}
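As a cross-check, the number of grams the nested loop expects can be counted directly from the same bounds; the expectedGramCount helper name is an assumption for illustration:

// Number of n-grams NGramTokenFilter should emit for a token of
// codePointCount code points: one per (start, end) pair visited by the
// nested loop in the test above.
static int expectedGramCount(int codePointCount, int minGram, int maxGram) {
    int count = 0;
    for (int start = 0; start < codePointCount; ++start) {
        int maxEnd = Math.min(codePointCount, start + maxGram);
        if (maxEnd >= start + minGram) {
            count += maxEnd - (start + minGram) + 1;
        }
    }
    return count;
}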