use of org.apache.lucene.util.AttributeFactory in project lucene-solr by apache.
the class TestKeywordTokenizer method testFactory.
public void testFactory() {
  Map<String, String> args = new HashMap<>();
  KeywordTokenizerFactory factory = new KeywordTokenizerFactory(args);
  AttributeFactory attributeFactory = newAttributeFactory();
  Tokenizer tokenizer = factory.create(attributeFactory);
  assertEquals(KeywordTokenizer.class, tokenizer.getClass());
}
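The factory call above only instantiates the tokenizer; nothing is tokenized until a reader is attached and the stream is consumed. The standalone sketch below shows that step, using the public AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY in place of the test helper newAttributeFactory(); the class name, input text, and main method are only illustrative.
import java.io.StringReader;
import java.util.HashMap;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizerFactory;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.AttributeFactory;

public class KeywordTokenizerSketch {
  public static void main(String[] args) throws Exception {
    KeywordTokenizerFactory factory = new KeywordTokenizerFactory(new HashMap<>());
    // DEFAULT_ATTRIBUTE_FACTORY is the stock factory backing most token streams
    Tokenizer tokenizer = factory.create(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
    tokenizer.setReader(new StringReader("hello keyword world"));
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      System.out.println(term.toString()); // KeywordTokenizer emits the whole input as one token
    }
    tokenizer.end();
    tokenizer.close();
  }
}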
use of org.apache.lucene.util.AttributeFactory in project lucene-solr by apache.
the class TestUnicodeWhitespaceTokenizer method testParamsFactory.
public void testParamsFactory() throws IOException {
  // negative maxTokenLen
  IllegalArgumentException iae = expectThrows(IllegalArgumentException.class,
      () -> new WhitespaceTokenizerFactory(makeArgs("rule", "unicode", "maxTokenLen", "-1")));
  assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: -1", iae.getMessage());
  // zero maxTokenLen
  iae = expectThrows(IllegalArgumentException.class,
      () -> new WhitespaceTokenizerFactory(makeArgs("rule", "unicode", "maxTokenLen", "0")));
  assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", iae.getMessage());
  // unknown parameter, should throw an IllegalArgumentException
  iae = expectThrows(IllegalArgumentException.class,
      () -> new WhitespaceTokenizerFactory(makeArgs("rule", "unicode", "maxTokenLen", "255", "randomParam", "rValue")));
  assertEquals("Unknown parameters: {randomParam=rValue}", iae.getMessage());
  // tokenizer will split at 5 chars, Token | izer, no matter what happens
  WhitespaceTokenizerFactory factory = new WhitespaceTokenizerFactory(makeArgs("rule", "unicode", "maxTokenLen", "5"));
  AttributeFactory attributeFactory = newAttributeFactory();
  Tokenizer tokenizer = factory.create(attributeFactory);
  StringReader reader = new StringReader("Tokenizer štest");
  tokenizer.setReader(reader);
  assertTokenStreamContents(tokenizer, new String[] { "Token", "izer", "štes", "t" });
  // tokenizer will split at 2 chars, To | ke | ni | ze | r, no matter what happens
  factory = new WhitespaceTokenizerFactory(makeArgs("rule", "unicode", "maxTokenLen", "2"));
  attributeFactory = newAttributeFactory();
  tokenizer = factory.create(attributeFactory);
  reader = new StringReader("TokenizerĀ test");
  tokenizer.setReader(reader);
  assertTokenStreamContents(tokenizer, new String[] { "To", "ke", "ni", "ze", "r", "te", "st" });
  // tokenizer will split at 10 chars, no matter what happens,
  // but the tokens here are all shorter than that
  factory = new WhitespaceTokenizerFactory(makeArgs("rule", "unicode", "maxTokenLen", "10"));
  attributeFactory = newAttributeFactory();
  tokenizer = factory.create(attributeFactory);
  reader = new StringReader("TokenizerĀ test");
  tokenizer.setReader(reader);
  assertTokenStreamContents(tokenizer, new String[] { "Tokenizer", "test" });
}
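makeArgs and newAttributeFactory are helpers of the test base class. Outside the test, the same maxTokenLen behaviour can be reproduced by filling the factory arguments by hand, as in this sketch (class name and sample input are invented):
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.AttributeFactory;

public class MaxTokenLenSketch {
  public static void main(String[] args) throws Exception {
    Map<String, String> params = new HashMap<>();
    params.put("rule", "unicode");   // use the Unicode whitespace definition
    params.put("maxTokenLen", "5");  // chop tokens longer than 5 chars
    WhitespaceTokenizerFactory factory = new WhitespaceTokenizerFactory(params);
    Tokenizer tokenizer = factory.create(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
    tokenizer.setReader(new StringReader("Tokenizer test"));
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      System.out.println(term.toString()); // Token, izer, test
    }
    tokenizer.end();
    tokenizer.close();
  }
}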
use of org.apache.lucene.util.AttributeFactory in project lucene-solr by apache.
the class FieldAnalysisRequestHandlerTest method testCustomAttribute.
// See SOLR-8460
@Test
public void testCustomAttribute() throws Exception {
  FieldAnalysisRequest request = new FieldAnalysisRequest();
  request.addFieldType("skutype1");
  request.setFieldValue("hi, 3456-12 a Test");
  request.setShowMatch(false);
  FieldType fieldType = new TextField();
  Analyzer analyzer = new TokenizerChain(new TokenizerFactory(Collections.emptyMap()) {
    @Override
    public Tokenizer create(AttributeFactory factory) {
      return new CustomTokenizer(factory);
    }
  }, new TokenFilterFactory[] { new TokenFilterFactory(Collections.emptyMap()) {
    @Override
    public TokenStream create(TokenStream input) {
      return new CustomTokenFilter(input);
    }
  } });
  fieldType.setIndexAnalyzer(analyzer);
  NamedList<NamedList> result = handler.analyzeValues(request, fieldType, "fieldNameUnused");
  // just test that we see "900" in the flags attribute here
  List<NamedList> tokenInfoList = (List<NamedList>) result.findRecursive("index", CustomTokenFilter.class.getName());
  // '1' from CustomTokenFilter plus 900 from CustomFlagsAttributeImpl.
  assertEquals(901, tokenInfoList.get(0).get("org.apache.lucene.analysis.tokenattributes.FlagsAttribute#flags"));
}
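CustomTokenizer, CustomTokenFilter and CustomFlagsAttributeImpl are private helper classes of the test and are not shown here. As a rough illustration of the mechanism they rely on (not the test's actual code), AttributeFactory.getStaticImplementation can wrap the default factory so that a tokenizer hands out a custom FlagsAttribute implementation; the class name and the 900 default below are invented to mirror the flag value the assertion looks for.
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.FlagsAttributeImpl;
import org.apache.lucene.util.AttributeFactory;

public class CustomFlagsSketch {

  // Hypothetical attribute impl: instances are created with flags = 900.
  public static class My900FlagsAttributeImpl extends FlagsAttributeImpl {
    public My900FlagsAttributeImpl() {
      setFlags(900);
    }
  }

  public static void main(String[] args) {
    // Requests for FlagsAttribute get our impl; every other attribute
    // still comes from the default factory.
    AttributeFactory factory = AttributeFactory.getStaticImplementation(
        AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, My900FlagsAttributeImpl.class);
    Tokenizer tokenizer = new WhitespaceTokenizer(factory);
    FlagsAttribute flags = tokenizer.addAttribute(FlagsAttribute.class);
    System.out.println(flags.getFlags()); // 900 - the custom impl was used
  }
}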
use of org.apache.lucene.util.AttributeFactory in project lucene-solr by apache.
the class TestUnicodeWhitespaceTokenizer method testFactory.
public void testFactory() {
  Map<String, String> args = new HashMap<>();
  args.put("rule", "unicode");
  WhitespaceTokenizerFactory factory = new WhitespaceTokenizerFactory(args);
  AttributeFactory attributeFactory = newAttributeFactory();
  Tokenizer tokenizer = factory.create(attributeFactory);
  assertEquals(UnicodeWhitespaceTokenizer.class, tokenizer.getClass());
}
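The "rule" argument is what selects the concrete tokenizer class: omitting it (the default is "java") yields the plain WhitespaceTokenizer, while "unicode" yields the UnicodeWhitespaceTokenizer asserted above. A small sketch, with an invented class name:
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory;
import org.apache.lucene.util.AttributeFactory;

public class WhitespaceRuleSketch {
  public static void main(String[] args) {
    // no "rule" argument -> the default "java" rule
    Tokenizer javaRule = new WhitespaceTokenizerFactory(new HashMap<>())
        .create(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
    // "rule" = "unicode" -> the Unicode whitespace rule
    Map<String, String> unicodeArgs = new HashMap<>();
    unicodeArgs.put("rule", "unicode");
    Tokenizer unicodeRule = new WhitespaceTokenizerFactory(unicodeArgs)
        .create(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
    System.out.println(javaRule.getClass().getSimpleName());    // WhitespaceTokenizer
    System.out.println(unicodeRule.getClass().getSimpleName()); // UnicodeWhitespaceTokenizer
  }
}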
use of org.apache.lucene.util.AttributeFactory in project lucene-solr by apache.
the class Analyzer method normalize.
/**
 * Normalize a string down to the representation that it would have in the
 * index.
 * <p>
 * This is typically used by query parsers in order to generate a query on
 * a given term, without tokenizing or stemming, which are undesirable if
 * the string to analyze is a partial word (e.g. in case of a wildcard or
 * fuzzy query).
 * <p>
 * This method uses {@link #initReaderForNormalization(String, Reader)} in
 * order to apply necessary character-level normalization and then
 * {@link #normalize(String, TokenStream)} in order to apply the normalizing
 * token filters.
 */
public final BytesRef normalize(final String fieldName, final String text) {
  try {
    // apply char filters
    final String filteredText;
    try (Reader reader = new StringReader(text)) {
      Reader filterReader = initReaderForNormalization(fieldName, reader);
      char[] buffer = new char[64];
      StringBuilder builder = new StringBuilder();
      for (;;) {
        final int read = filterReader.read(buffer, 0, buffer.length);
        if (read == -1) {
          break;
        }
        builder.append(buffer, 0, read);
      }
      filteredText = builder.toString();
    } catch (IOException e) {
      throw new IllegalStateException("Normalization threw an unexpected exception", e);
    }
    final AttributeFactory attributeFactory = attributeFactory(fieldName);
    try (TokenStream ts = normalize(fieldName,
        new StringTokenStream(attributeFactory, filteredText, text.length()))) {
      final TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
      ts.reset();
      if (ts.incrementToken() == false) {
        throw new IllegalStateException("The normalization token stream is "
            + "expected to produce exactly 1 token, but got 0 for analyzer "
            + this + " and input \"" + text + "\"");
      }
      final BytesRef term = BytesRef.deepCopyOf(termAtt.getBytesRef());
      if (ts.incrementToken()) {
        throw new IllegalStateException("The normalization token stream is "
            + "expected to produce exactly 1 token, but got 2+ for analyzer "
            + this + " and input \"" + text + "\"");
      }
      ts.end();
      return term;
    }
  } catch (IOException e) {
    throw new IllegalStateException("Normalization threw an unexpected exception", e);
  }
}
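From the caller's side, none of the StringTokenStream plumbing above is visible: an Analyzer subclass overrides the protected normalize(String, TokenStream) hook and client code simply calls the public normalize(String, String). A minimal sketch with an arbitrary lower-casing chain; the field name, input, and class name are invented.
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.util.BytesRef;

public class NormalizeSketch {
  public static void main(String[] args) {
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new WhitespaceTokenizer();
        return new TokenStreamComponents(source, new LowerCaseFilter(source));
      }
      @Override
      protected TokenStream normalize(String fieldName, TokenStream in) {
        // character-level normalization only, no tokenizing or stemming
        return new LowerCaseFilter(in);
      }
    };
    BytesRef term = analyzer.normalize("title", "WiLdCaRd*");
    System.out.println(term.utf8ToString()); // wildcard*
    analyzer.close();
  }
}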