use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project OpenGrok by OpenGrok.
the class CustomAssertions method assertSymbolStream.
/**
 * Asserts that the specified tokenizer class produces an expected stream of
 * symbols from the specified input.
 * @param klass the tokenizer class under test
 * @param iss the input stream
 * @param expectedTokens the expected, ordered token list
 * @throws java.lang.Exception if an error occurs constructing a
 * {@code klass} instance or testing the stream
 */
public static void assertSymbolStream(Class<? extends JFlexSymbolMatcher> klass,
        InputStream iss, List<String> expectedTokens) throws Exception {

    byte[] inputCopy = copyStream(iss);
    String input = new String(inputCopy, StandardCharsets.UTF_8);
    JFlexTokenizer tokenizer = new JFlexTokenizer(klass.getConstructor(Reader.class)
            .newInstance(new InputStreamReader(new ByteArrayInputStream(inputCopy),
                    StandardCharsets.UTF_8)));

    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    OffsetAttribute offs = tokenizer.addAttribute(OffsetAttribute.class);

    int count = 0;
    List<String> tokens = new ArrayList<>();
    while (tokenizer.incrementToken()) {
        String termValue = term.toString();
        tokens.add(termValue);
        // The reported offsets must cut the original input back to the term value.
        String cutValue = input.substring(offs.startOffset(), offs.endOffset());
        assertEquals("cut term" + (1 + count), cutValue, termValue);
        ++count;
    }

    count = 0;
    for (String token : tokens) {
        // Assertion messages use 1-based numbering to accord with term position.
        if (count >= expectedTokens.size()) {
            printTokens(tokens);
            assertTrue("too many tokens at term" + (1 + count) + ": " + token,
                    count < expectedTokens.size());
        }
        String expected = expectedTokens.get(count);
        if (!token.equals(expected)) {
            printTokens(tokens);
            assertEquals("term" + (1 + count), expected, token);
        }
        count++;
    }
    assertEquals("wrong number of tokens", expectedTokens.size(), count);
}
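For context, a caller passes the symbol-matcher class, an input stream over a source file, and the ordered list of expected symbols. A minimal usage sketch, in which the tokenizer class, resource path, and expected symbols are hypothetical stand-ins:

// Hypothetical test; SampleSymbolTokenizer and the resource name are illustrative only.
@Test
public void sampleSymbolStream() throws Exception {
    try (InputStream iss = getClass().getResourceAsStream("/analysis/sample.c")) {
        List<String> expected = Arrays.asList("main", "argc", "argv");
        CustomAssertions.assertSymbolStream(SampleSymbolTokenizer.class, iss, expected);
    }
}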
use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project SearchServices by Alfresco.
the class PathTokenFilterTest method testAttributesAfterStreamEnd.
public void testAttributesAfterStreamEnd() throws IOException {
    final String path = "uri1:one";
    StringReader reader = new StringReader(path);
    PathTokenFilter ts = new PathTokenFilter(PathTokenFilter.PATH_SEPARATOR,
            PathTokenFilter.SEPARATOR_TOKEN_TEXT, PathTokenFilter.NO_NS_TOKEN_TEXT,
            PathTokenFilter.NAMESPACE_START_DELIMITER,
            PathTokenFilter.NAMESPACE_END_DELIMITER, true);
    ts.setReader(reader);

    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);

    // PathTokenFilter.end() will be called after all tokens are consumed.
    tokenise(ts, new String[] { "uri1", "one" });

    // Check that the attributes were cleaned up.
    assertEquals("", termAtt.toString());
    // "word" is the default token type.
    assertEquals("word", typeAtt.type());
    assertEquals(0, posIncAtt.getPositionIncrement());
    // The final offset points past the end of the input.
    assertEquals(path.length(), offsetAtt.startOffset());
    assertEquals(path.length(), offsetAtt.endOffset());
}
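The tokenise(..) helper is not shown in this snippet. A hypothetical reconstruction, assuming it follows the standard Lucene consumer contract (reset, iterate, end, close) and asserts each expected term:

// Hypothetical reconstruction of the tokenise(..) helper; the real
// implementation in PathTokenFilterTest may differ in details.
private void tokenise(TokenStream ts, String[] expectedTerms) throws IOException {
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    for (String expected : expectedTerms) {
        assertTrue("stream ended early", ts.incrementToken());
        assertEquals(expected, termAtt.toString());
    }
    assertFalse("stream produced extra tokens", ts.incrementToken());
    ts.end(); // establishes the final attribute state that the test asserts
    ts.close();
}

Whatever its exact body, the helper must call end(), since the assertions above check the attribute state that end() establishes.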
use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project SearchServices by Alfresco.
the class Solr4QueryParser method getToken.
protected String getToken(String field, String value, AnalysisMode analysisMode) throws ParseException {
    try (TokenStream source = getAnalyzer().tokenStream(field, new StringReader(value))) {
        String tokenised = null;
        source.reset(); // the TokenStream contract requires reset() before incrementToken()
        while (source.incrementToken()) {
            CharTermAttribute cta = source.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = source.getAttribute(OffsetAttribute.class);
            TypeAttribute typeAtt = null;
            if (source.hasAttribute(TypeAttribute.class)) {
                typeAtt = source.getAttribute(TypeAttribute.class);
            }
            PositionIncrementAttribute posIncAtt = null;
            if (source.hasAttribute(PositionIncrementAttribute.class)) {
                posIncAtt = source.getAttribute(PositionIncrementAttribute.class);
            }
            // Copy the current attribute state into a standalone token; the loop
            // runs to the end of the stream, so the last token wins.
            PackedTokenAttributeImpl token = new PackedTokenAttributeImpl();
            token.setEmpty().copyBuffer(cta.buffer(), 0, cta.length());
            token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            if (typeAtt != null) {
                token.setType(typeAtt.type());
            }
            if (posIncAtt != null) {
                token.setPositionIncrement(posIncAtt.getPositionIncrement());
            }
            tokenised = token.toString();
        }
        return tokenised;
    } catch (IOException e) {
        throw new ParseException("IO: " + e.getMessage());
    }
}
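A minimal sketch of a call site; the field name and AnalysisMode constant below are hypothetical stand-ins, and the exact output depends on the analyzer configured for the field:

// Hypothetical usage: reduce a raw value to its last analyzed token.
// "cm:name" and AnalysisMode.TOKENISE are illustrative only.
String analyzed = getToken("cm:name", "Quick Brown Foxes", AnalysisMode.TOKENISE);
// With a stemming English analyzer this might yield "fox"; because the loop
// runs the stream to exhaustion, only the final token is returned.

Since the method discards all but the last token, it is presumably only meaningful for values expected to analyze to a single term.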
use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project SearchServices by Alfresco.
the class Solr4QueryParser method getFirstTokenForRange.
private String getFirstTokenForRange(String string, FieldInstance field) throws IOException {
    PackedTokenAttributeImpl nextToken;
    TokenStream source = null;
    try {
        source = getAnalyzer().tokenStream(field.getField(), new StringReader(string));
        source.reset();
        while (source.incrementToken()) {
            CharTermAttribute cta = source.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = source.getAttribute(OffsetAttribute.class);
            TypeAttribute typeAtt = null;
            if (source.hasAttribute(TypeAttribute.class)) {
                typeAtt = source.getAttribute(TypeAttribute.class);
            }
            PositionIncrementAttribute posIncAtt = null;
            if (source.hasAttribute(PositionIncrementAttribute.class)) {
                posIncAtt = source.getAttribute(PositionIncrementAttribute.class);
            }
            nextToken = new PackedTokenAttributeImpl();
            nextToken.setEmpty().copyBuffer(cta.buffer(), 0, cta.length());
            nextToken.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
            if (typeAtt != null) {
                nextToken.setType(typeAtt.type());
            }
            if (posIncAtt != null) {
                nextToken.setPositionIncrement(posIncAtt.getPositionIncrement());
            }
            // Return on the first token; the rest of the stream is not consumed.
            return nextToken.toString();
        }
    } finally {
        try {
            if (source != null) {
                source.close();
            }
        } catch (IOException e) {
            // ignore
        }
    }
    return null;
}
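Since this method consumes at most one token, it could be written more compactly with try-with-resources. A sketch, renamed to mark it as an illustration; one behavioral difference is that an IOException thrown by close() would propagate here, whereas the finally block above deliberately swallows it:

// Sketch only: equivalent first-token extraction with try-with-resources.
private String getFirstTokenForRangeSketch(String string, FieldInstance field) throws IOException {
    try (TokenStream source = getAnalyzer().tokenStream(field.getField(), new StringReader(string))) {
        source.reset();
        if (source.incrementToken()) {
            // CharTermAttribute.toString() returns the term text, so copying the
            // full attribute state is unnecessary when only the text is needed.
            return source.getAttribute(CharTermAttribute.class).toString();
        }
        return null;
    }
}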
use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project epadd by ePADD.
the class Highlighter method dumpTokenStream.
/**
 * @param content - text to be highlighted
 * @param term - this can be a generic query passed to the Lucene search, for example: elate|happy|invite, hope, "Robert Creeley", /guth.+/ , /[0-9\\-]*[0-9]{3}[- ][0-9]{2}[- ][0-9]{4}[0-9\\-]+/ are all valid terms
 * @param preTag - HTML pre-tag, for example: <B>
 * @param postTag - HTML post-tag, for example: </B>
 * The highlighted content will have [preTag] matching term [postTag].
 * When the term is "Robert Creeley", the output is "On Tue, Jun 24, 2014 at 11:56 AM, [preTag]Robert Creeley's[postTag] <creeley@acsu.buffalo.edu> wrote:"
 * (This Javadoc describes the class's highlighting parameters; the helper below is a debug aid.)
 */
/**
 * Debug method only.
 */
private static String dumpTokenStream(Analyzer analyzer, TokenStream tokenStream) throws IOException {
    // Taken from https://stackoverflow.com/questions/2638200/how-to-get-a-token-from-a-lucene-tokenstream
    // Note: the analyzer parameter is not used by the dump itself.
    OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    StringBuilder sb = new StringBuilder();
    sb.append("Tokens:\n");
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        int startOffset = offsetAttribute.startOffset();
        int endOffset = offsetAttribute.endOffset();
        String term = charTermAttribute.toString();
        sb.append(term).append(" (offsets: ").append(startOffset).append(", ").append(endOffset).append(")\n");
    }
    // Reset again so the stream is left in a consistent state for the caller.
    tokenStream.reset();
    return sb.toString();
}
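A hypothetical call site inside the same class, assuming a StandardAnalyzer and a throwaway field name; note the helper resets the stream itself, so the caller should not call reset() first:

// Illustrative only; "body" and the sample text are arbitrary.
Analyzer analyzer = new StandardAnalyzer();
try (TokenStream ts = analyzer.tokenStream("body", new StringReader("Robert Creeley wrote back"))) {
    System.out.println(dumpTokenStream(analyzer, ts));
}

With the default StandardAnalyzer chain this prints each lowercased term together with its character offsets into the original text.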