Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project lucene-solr by apache.
From the class TestCharTokenizers, method testCrossPlaneNormalization.
// LUCENE-3642: normalize SMP->BMP and check that offsets are correct
public void testCrossPlaneNormalization() throws IOException {
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory()) {
        @Override
        protected int normalize(int c) {
          if (c > 0xffff) {
            return 'δ';
          } else {
            return c;
          }
        }
      };
      return new TokenStreamComponents(tokenizer, tokenizer);
    }
  };
  int num = 1000 * RANDOM_MULTIPLIER;
  for (int i = 0; i < num; i++) {
    String s = TestUtil.randomUnicodeString(random());
    try (TokenStream ts = analyzer.tokenStream("foo", s)) {
      ts.reset();
      OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
      while (ts.incrementToken()) {
        String highlightedText = s.substring(offsetAtt.startOffset(), offsetAtt.endOffset());
        for (int j = 0, cp = 0; j < highlightedText.length(); j += Character.charCount(cp)) {
          cp = highlightedText.codePointAt(j);
          assertTrue("non-letter:" + Integer.toHexString(cp), Character.isLetter(cp));
        }
      }
      ts.end();
    }
  }
  // just for fun
  checkRandomData(random(), analyzer, num);
  analyzer.close();
}
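The test above follows the standard TokenStream consumption contract (addAttribute, reset, incrementToken loop, end, close), with OffsetAttribute mapping each emitted token back into the original string. A minimal standalone sketch of that pattern, assuming a StandardAnalyzer and an arbitrary field name (both are placeholders, not part of the test above):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class OffsetAttributeSketch {
  public static void main(String[] args) throws IOException {
    String text = "a penny saved is a penny earned";
    // StandardAnalyzer and the field name "body" are placeholders chosen for illustration only
    try (Analyzer analyzer = new StandardAnalyzer();
         TokenStream ts = analyzer.tokenStream("body", text)) {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
      ts.reset();                               // required before the first incrementToken()
      while (ts.incrementToken()) {
        // start/end offsets index into the original text, so substring() recovers the raw token
        String cut = text.substring(offsetAtt.startOffset(), offsetAtt.endOffset());
        System.out.println(termAtt.toString() + " [" + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + ") -> " + cut);
      }
      ts.end();                                 // finalizes offsets after the last token
    }
  }
}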
Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project textdb by TextDB.
From the class DataflowUtils, method generatePayload.
public static List<Span> generatePayload(String attributeName, String fieldValue, Analyzer luceneAnalyzer) {
    List<Span> payload = new ArrayList<>();
    try {
        TokenStream tokenStream = luceneAnalyzer.tokenStream(null, new StringReader(fieldValue));
        OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute positionIncrementAttribute = tokenStream.addAttribute(PositionIncrementAttribute.class);
        int tokenPositionCounter = -1;
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            tokenPositionCounter += positionIncrementAttribute.getPositionIncrement();
            int tokenPosition = tokenPositionCounter;
            int charStart = offsetAttribute.startOffset();
            int charEnd = offsetAttribute.endOffset();
            String analyzedTermStr = charTermAttribute.toString();
            String originalTermStr = fieldValue.substring(charStart, charEnd);
            payload.add(new Span(attributeName, charStart, charEnd, analyzedTermStr, originalTermStr, tokenPosition));
        }
        tokenStream.close();
    } catch (IOException e) {
        // return empty payload
        payload.clear();
    }
    return payload;
}
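generatePayload stores both the analyzed term (charTermAttribute.toString()) and the original surface form cut out by the offsets, since an analyzer may lowercase, stem, or otherwise normalize a token. A small standalone sketch of that distinction, assuming a StandardAnalyzer (a placeholder, not necessarily the analyzer textdb uses):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class AnalyzedVsOriginalSketch {
  public static void main(String[] args) throws IOException {
    String fieldValue = "TextDB Payload Spans";
    try (Analyzer analyzer = new StandardAnalyzer();
         TokenStream ts = analyzer.tokenStream(null, new StringReader(fieldValue))) {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        String analyzed = termAtt.toString();                                                    // e.g. "textdb" after lowercasing
        String original = fieldValue.substring(offsetAtt.startOffset(), offsetAtt.endOffset());  // e.g. "TextDB" as written
        System.out.println(analyzed + " <- " + original);
      }
      ts.end();
    }
  }
}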
Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project lucene-solr by apache.
From the class AnalyzingInfixSuggesterTest, method testHighlightAsObject.
@SuppressWarnings("unchecked")
public void testHighlightAsObject() throws Exception {
  Input[] keys = new Input[] { new Input("a penny saved is a penny earned", 10, new BytesRef("foobaz")) };
  Analyzer a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
  AnalyzingInfixSuggester suggester = new AnalyzingInfixSuggester(newDirectory(), a, a, 3, false) {
    @Override
    protected Object highlight(String text, Set<String> matchedTokens, String prefixToken) throws IOException {
      try (TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text))) {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        ts.reset();
        List<LookupHighlightFragment> fragments = new ArrayList<>();
        int upto = 0;
        while (ts.incrementToken()) {
          String token = termAtt.toString();
          int startOffset = offsetAtt.startOffset();
          int endOffset = offsetAtt.endOffset();
          if (upto < startOffset) {
            fragments.add(new LookupHighlightFragment(text.substring(upto, startOffset), false));
            upto = startOffset;
          } else if (upto > startOffset) {
            continue;
          }
          if (matchedTokens.contains(token)) {
            // Token matches.
            fragments.add(new LookupHighlightFragment(text.substring(startOffset, endOffset), true));
            upto = endOffset;
          } else if (prefixToken != null && token.startsWith(prefixToken)) {
            fragments.add(new LookupHighlightFragment(text.substring(startOffset, startOffset + prefixToken.length()), true));
            if (prefixToken.length() < token.length()) {
              fragments.add(new LookupHighlightFragment(text.substring(startOffset + prefixToken.length(), startOffset + token.length()), false));
            }
            upto = endOffset;
          }
        }
        ts.end();
        int endOffset = offsetAtt.endOffset();
        if (upto < endOffset) {
          fragments.add(new LookupHighlightFragment(text.substring(upto), false));
        }
        return fragments;
      }
    }
  };
  suggester.build(new InputArrayIterator(keys));
  List<LookupResult> results = suggester.lookup(TestUtil.stringToCharSequence("ear", random()), 10, true, true);
  assertEquals(1, results.size());
  assertEquals("a penny saved is a penny <b>ear</b>ned", toString((List<LookupHighlightFragment>) results.get(0).highlightKey));
  assertEquals(10, results.get(0).value);
  assertEquals(new BytesRef("foobaz"), results.get(0).payload);
  suggester.close();
  a.close();
}
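One detail worth noting in the highlight override: offsetAtt.endOffset() is read once more after ts.end(), because end() moves the offset attribute to the final offset of the whole input, which is what lets any trailing, unmatched text be appended as a non-highlighted fragment. A reduced sketch of that contract, with the analyzer, field name, and input text as placeholders:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class FinalOffsetSketch {
  public static void main(String[] args) throws IOException {
    String text = "a penny saved   ";                         // trailing whitespace after the last token
    try (Analyzer analyzer = new StandardAnalyzer();
         TokenStream ts = analyzer.tokenStream("text", text)) {
      OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
      ts.reset();
      int lastTokenEnd = 0;
      while (ts.incrementToken()) {
        lastTokenEnd = offsetAtt.endOffset();                  // end of the current token (13 for "saved")
      }
      ts.end();
      int finalOffset = offsetAtt.endOffset();                 // end of the entire input (16), trailing spaces included
      System.out.println(lastTokenEnd + " vs " + finalOffset);
    }
  }
}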
Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project OpenGrok by OpenGrok.
From the class DefinitionsTokenStreamTest, method testDefinitionsVsContent.
private void testDefinitionsVsContent(boolean expandTabs, String sourceResource, String tagsResource, int expectedCount, boolean doSupplement, Map<Integer, SimpleEntry<String, String>> overrides) throws IOException {
    StreamSource src = getSourceFromResource(sourceResource);
    // Deserialize the ctags.
    int tabSize = expandTabs ? 8 : 0;
    String suppResource = doSupplement ? sourceResource : null;
    Definitions defs = StreamUtils.readTagsFromResource(tagsResource, suppResource, tabSize);
    // Read the whole input.
    StringBuilder bld = new StringBuilder();
    String source;
    try (Reader rdr = ExpandTabsReader.wrap(IOUtils.createBOMStrippedReader(src.getStream(), StandardCharsets.UTF_8.name()), tabSize)) {
        int c;
        while ((c = rdr.read()) != -1) {
            bld.append((char) c);
        }
        source = bld.toString();
    }
    // Deserialize the token stream.
    DefinitionsTokenStream tokstream = new DefinitionsTokenStream();
    tokstream.initialize(defs, src, (in) -> {
        return ExpandTabsReader.wrap(in, tabSize);
    });
    // Iterate through stream.
    CharTermAttribute term = tokstream.getAttribute(CharTermAttribute.class);
    assertNotNull("CharTermAttribute", term);
    OffsetAttribute offs = tokstream.getAttribute(OffsetAttribute.class);
    assertNotNull("OffsetAttribute", offs);
    int count = 0;
    while (tokstream.incrementToken()) {
        ++count;
        String termValue = term.toString();
        String cutValue = source.substring(offs.startOffset(), offs.endOffset());
        // If an override exists, test it specially.
        if (overrides != null && overrides.containsKey(count)) {
            SimpleEntry<String, String> overkv = overrides.get(count);
            assertEquals("cut term override" + count, overkv.getKey(), cutValue);
            assertEquals("cut term w.r.t. term override" + count, overkv.getValue(), termValue);
            continue;
        }
        boolean cutContainsTerm = cutValue.endsWith(termValue);
        assertTrue("cut term" + count + " at " + (offs.startOffset()) + "-" + (offs.endOffset()) + "[" + cutValue + "] vs [" + termValue + "]", cutContainsTerm);
    }
    assertEquals("token count", expectedCount, count);
}
Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project OpenGrok by OpenGrok.
From the class PathTokenizerTest, method testIncrementToken.
/**
 * Test of incrementToken method, of class PathTokenizer.
 */
@Test
public void testIncrementToken() throws Exception {
    String inputText = "alpha/beta/gamma/delta.ext";
    String[] expectedTokens = inputText.split("[/.]");
    PathTokenizer tokenizer = new PathTokenizer();
    tokenizer.setReader(new StringReader(inputText));
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class);
    int count = 0;
    int dots = 0;
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        if (term.toString().equals(".")) {
            dots++;
            break;
        }
        assertTrue("too many tokens", count < expectedTokens.length);
        String expected = expectedTokens[count];
        assertEquals("term", expected, term.toString());
        assertEquals("start", inputText.indexOf(expected), offset.startOffset());
        assertEquals("end", inputText.indexOf(expected) + expected.length(), offset.endOffset());
        count++;
    }
    tokenizer.end();
    tokenizer.close();
    assertEquals("wrong number of tokens", expectedTokens.length, count + dots);
}