Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in the project lucene-solr by Apache.
From the class PreAnalyzedFieldTest, method testInvalidJson.
/**
 * Feeds every entry of {@code invalidJson} to the index analyzer of a freshly initialized
 * {@link PreAnalyzedField} and expects an exception for each one, then checks that the same
 * analyzer still tokenizes {@code validJson} without emitting zero-length tokens.
 *
 * @throws Exception if the analyzer fails on the valid input or a stream cannot be closed
 */
public void testInvalidJson() throws Exception {
  PreAnalyzedField paf = new PreAnalyzedField();
  paf.init(h.getCore().getLatestSchema(), Collections.emptyMap());
  Analyzer preAnalyzer = paf.getIndexAnalyzer();
  for (String s : invalidJson) {
    TokenStream stream = null;
    try {
      stream = preAnalyzer.tokenStream("dummy", s);
      // exception should be triggered here.
      stream.reset();
      fail("should fail: '" + s + "'");
    } catch (Exception e) {
      // expected
    } finally {
      // always release the stream so the analyzer can hand out the next one
      if (stream != null) {
        stream.close();
      }
    }
  }
  // make sure the analyzer can now handle properly formatted input;
  // try-with-resources closes the stream even when an assertion fails
  try (TokenStream stream = preAnalyzer.tokenStream("dummy", validJson)) {
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      assertFalse("zero-length token", termAttr.length() == 0);
    }
    stream.end();
  }
}
Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in the project lucene-solr by Apache.
From the class TestStopAnalyzer, method testStopList.
/**
 * Builds a {@link StopAnalyzer} from a custom stop-word set and checks that none of the
 * tokens it emits for a sample sentence is contained in that set.
 *
 * @throws IOException if the token stream fails while being consumed or closed
 */
public void testStopList() throws IOException {
  CharArraySet stopWordsSet = new CharArraySet(asSet("good", "test", "analyzer"), false);
  // Both resources are closed in reverse order (stream first, then analyzer),
  // even when an assertion fails part-way through.
  try (StopAnalyzer newStop = new StopAnalyzer(stopWordsSet);
       TokenStream stream = newStop.tokenStream("test", "This is a good test of the english stop analyzer")) {
    assertNotNull(stream);
    CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      String text = termAtt.toString();
      assertFalse(stopWordsSet.contains(text));
    }
    stream.end();
  }
}
Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in the project lucene-solr by Apache.
From the class TestTypeTokenFilter, method testPositons.
/**
 * Consumes the given filter to the end, logging each token and asserting that every
 * emitted token carries a position increment of 3. The filter is ended and closed
 * once all tokens have been read.
 *
 * @param stpf the type token filter to consume
 * @throws IOException if the filter fails while being consumed or closed
 */
private void testPositons(TypeTokenFilter stpf) throws IOException {
  TypeAttribute typeAtt = stpf.getAttribute(TypeAttribute.class);
  CharTermAttribute termAttribute = stpf.getAttribute(CharTermAttribute.class);
  PositionIncrementAttribute posIncrAtt = stpf.getAttribute(PositionIncrementAttribute.class);
  stpf.reset();
  while (stpf.incrementToken()) {
    log("Token: " + termAttribute.toString() + ": " + typeAtt.type() + " - " + posIncrAtt.getPositionIncrement());
    // expected value first, actual second, so a failure reports the values the right way round
    assertEquals("if position increment is enabled the positionIncrementAttribute value should be 3, otherwise 1", 3, posIncrAtt.getPositionIncrement());
  }
  stpf.end();
  stpf.close();
}
Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in the project lucene-solr by Apache.
From the class TestTeeSinkTokenFilter, method performance.
/**
 * Not a real test case: prints rough timings that compare analyzing the same text twice
 * ("two fields") against analyzing it once through a tee with one sink, for several
 * token counts and modulo values. Along the way it asserts that both approaches yield
 * the same terms and the same accumulated position increments.
 */
@SuppressWarnings("resource")
public void performance() throws Exception {
  final int[] tokenCounts = { 100, 500, 1000, 2000, 5000, 10000 };
  final int[] moduli = { 1, 2, 5, 10, 20, 50, 100, 200, 500 };
  for (int tokenCount : tokenCounts) {
    StringBuilder text = new StringBuilder();
    System.out.println("-----Tokens: " + tokenCount + "-----");
    for (int n = 0; n < tokenCount; n++) {
      text.append(English.intToEnglish(n).toUpperCase(Locale.ROOT)).append(' ');
    }

    // First check that the sink and a directly filtered stream emit identical terms.
    TeeSinkTokenFilter checkTee = new TeeSinkTokenFilter(new StandardFilter(standardTokenizer(text)));
    TokenStream checkSink = new ModuloTokenFilter(checkTee.newSinkTokenStream(), 100);
    checkTee.consumeAllTokens();
    TokenStream direct = new ModuloTokenFilter(new StandardFilter(standardTokenizer(text)), 100);
    CharTermAttribute directTerm = direct.addAttribute(CharTermAttribute.class);
    CharTermAttribute sinkTerm = checkSink.addAttribute(CharTermAttribute.class);
    int tokenIndex = 0;
    while (direct.incrementToken()) {
      assertTrue(checkSink.incrementToken());
      assertTrue(directTerm + " is not equal to " + sinkTerm + " at token: " + tokenIndex, directTerm.equals(sinkTerm));
      tokenIndex++;
    }

    for (int modulus : moduli) {
      // Variant 1: two fields, each analyzed from scratch, for 20 documents.
      int tfPos = 0;
      long begin = System.currentTimeMillis();
      for (int doc = 0; doc < 20; doc++) {
        TokenStream plainField = new StandardFilter(standardTokenizer(text));
        PositionIncrementAttribute plainPosIncr = plainField.getAttribute(PositionIncrementAttribute.class);
        while (plainField.incrementToken()) {
          tfPos += plainPosIncr.getPositionIncrement();
        }
        TokenStream moduloField = new ModuloTokenFilter(new StandardFilter(standardTokenizer(text)), modulus);
        PositionIncrementAttribute moduloPosIncr = moduloField.getAttribute(PositionIncrementAttribute.class);
        while (moduloField.incrementToken()) {
          tfPos += moduloPosIncr.getPositionIncrement();
        }
      }
      long end = System.currentTimeMillis();
      System.out.println("ModCount: " + modulus + " Two fields took " + (end - begin) + " ms");

      // Variant 2: one field analyzed once, with a single sink replaying its tokens.
      int sinkPos = 0;
      begin = System.currentTimeMillis();
      for (int doc = 0; doc < 20; doc++) {
        TeeSinkTokenFilter tee = new TeeSinkTokenFilter(new StandardFilter(standardTokenizer(text)));
        TokenStream replay = new ModuloTokenFilter(tee.newSinkTokenStream(), modulus);
        PositionIncrementAttribute teePosIncr = tee.getAttribute(PositionIncrementAttribute.class);
        while (tee.incrementToken()) {
          sinkPos += teePosIncr.getPositionIncrement();
        }
        PositionIncrementAttribute replayPosIncr = replay.getAttribute(PositionIncrementAttribute.class);
        while (replay.incrementToken()) {
          sinkPos += replayPosIncr.getPositionIncrement();
        }
      }
      end = System.currentTimeMillis();
      System.out.println("ModCount: " + modulus + " Tee fields took " + (end - begin) + " ms");
      assertTrue(sinkPos + " does not equal: " + tfPos, sinkPos == tfPos);
    }
    System.out.println("- End Tokens: " + tokenCount + "-----");
  }
}
Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in the project lucene-solr by Apache.
From the class TestSnowball, method testFilterTokens.
/**
 * Pulls one token through an English {@link SnowballFilter} wrapped around a
 * {@code TestTokenStream} and checks the term text together with the offset, type,
 * position-increment, flags and payload attributes seen on the filter.
 */
public void testFilterTokens() throws Exception {
  SnowballFilter snowball = new SnowballFilter(new TestTokenStream(), "English");

  // Look up every attribute before the token is advanced.
  CharTermAttribute term = snowball.getAttribute(CharTermAttribute.class);
  OffsetAttribute offsets = snowball.getAttribute(OffsetAttribute.class);
  TypeAttribute type = snowball.getAttribute(TypeAttribute.class);
  PayloadAttribute payload = snowball.getAttribute(PayloadAttribute.class);
  PositionIncrementAttribute posIncr = snowball.getAttribute(PositionIncrementAttribute.class);
  FlagsAttribute flags = snowball.getAttribute(FlagsAttribute.class);

  snowball.incrementToken();

  assertEquals("accent", term.toString());
  assertEquals(2, offsets.startOffset());
  assertEquals(7, offsets.endOffset());
  assertEquals("wrd", type.type());
  assertEquals(3, posIncr.getPositionIncrement());
  assertEquals(77, flags.getFlags());
  BytesRef expectedPayload = new BytesRef(new byte[] { 0, 1, 2, 3 });
  assertEquals(expectedPayload, payload.getPayload());
}
Aggregations