Use of org.apache.lucene.analysis.util.TokenizerFactory in project jackrabbit-oak by apache.
The class NodeStateAnalyzerFactory, method composeAnalyzer:
private Analyzer composeAnalyzer(NodeState state) {
    TokenizerFactory tf = loadTokenizer(state.getChildNode(LuceneIndexConstants.ANL_TOKENIZER));
    CharFilterFactory[] cfs = loadCharFilterFactories(state.getChildNode(LuceneIndexConstants.ANL_CHAR_FILTERS));
    TokenFilterFactory[] tffs = loadTokenFilterFactories(state.getChildNode(LuceneIndexConstants.ANL_FILTERS));
    return new TokenizerChain(cfs, tf, tffs);
}
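The loadTokenizer helper is not shown above. As a hedged sketch (the helper's shape and the "name" property are assumptions, not the oak source), it would typically read the factory's SPI name from the tokenizer node and delegate to Lucene's TokenizerFactory.forName lookup:

// Sketch only; requires java.util.HashMap/Map and org.apache.lucene.analysis.util.TokenizerFactory.
private TokenizerFactory loadTokenizer(NodeState state) {
    // SPI name of the tokenizer, e.g. "whitespace" or "standard" (property name assumed)
    String name = state.getString("name");
    // Remaining node properties would be collected here and passed through as factory arguments
    Map<String, String> args = new HashMap<>();
    // Resolve the factory through Lucene's SPI lookup and let it parse its args
    return TokenizerFactory.forName(name, args);
}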
Use of org.apache.lucene.analysis.util.TokenizerFactory in project lucene-solr by apache.
The class SynonymFilterFactory, method inform:
@Override
public void inform(ResourceLoader loader) throws IOException {
    final TokenizerFactory factory = tokenizerFactory == null ? null : loadTokenizerFactory(loader, tokenizerFactory);
    Analyzer analyzer;
    if (analyzerName != null) {
        analyzer = loadAnalyzer(loader, analyzerName);
    } else {
        analyzer = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer() : factory.create();
                TokenStream stream = ignoreCase ? new LowerCaseFilter(tokenizer) : tokenizer;
                return new TokenStreamComponents(tokenizer, stream);
            }
        };
    }
    try (Analyzer a = analyzer) {
        String formatClass = format;
        if (format == null || format.equals("solr")) {
            formatClass = SolrSynonymParser.class.getName();
        } else if (format.equals("wordnet")) {
            formatClass = WordnetSynonymParser.class.getName();
        }
        // TODO: expose dedup as a parameter?
        map = loadSynonyms(loader, formatClass, true, a);
    } catch (ParseException e) {
        throw new IOException("Error parsing synonyms file:", e);
    }
}
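The loadTokenizerFactory helper is referenced but not shown. A plausible sketch, assuming an empty args map (the exact lucene-solr implementation may differ): resolve the class through the ResourceLoader, instantiate it via the conventional Map<String,String> constructor, and inform it if it is ResourceLoaderAware:

private TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname) throws IOException {
    Class<? extends TokenizerFactory> clazz = loader.findClass(cname, TokenizerFactory.class);
    try {
        // Analysis factories conventionally expose a Map<String,String> constructor
        TokenizerFactory tokFactory = clazz.getConstructor(Map.class).newInstance(new HashMap<String, String>());
        // Factories that load their own resources (word lists, etc.) need the loader too
        if (tokFactory instanceof ResourceLoaderAware) {
            ((ResourceLoaderAware) tokFactory).inform(loader);
        }
        return tokFactory;
    } catch (Exception e) {
        throw new IOException("Unable to load TokenizerFactory: " + cname, e);
    }
}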
Use of org.apache.lucene.analysis.util.TokenizerFactory in project lucene-solr by apache.
The class SynonymGraphFilterFactory, method inform:
@Override
public void inform(ResourceLoader loader) throws IOException {
    final TokenizerFactory factory = tokenizerFactory == null ? null : loadTokenizerFactory(loader, tokenizerFactory);
    Analyzer analyzer;
    if (analyzerName != null) {
        analyzer = loadAnalyzer(loader, analyzerName);
    } else {
        analyzer = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer() : factory.create();
                TokenStream stream = ignoreCase ? new LowerCaseFilter(tokenizer) : tokenizer;
                return new TokenStreamComponents(tokenizer, stream);
            }
        };
    }
    try (Analyzer a = analyzer) {
        String formatClass = format;
        if (format == null || format.equals("solr")) {
            formatClass = SolrSynonymParser.class.getName();
        } else if (format.equals("wordnet")) {
            formatClass = WordnetSynonymParser.class.getName();
        }
        // TODO: expose dedup as a parameter?
        map = loadSynonyms(loader, formatClass, true, a);
    } catch (ParseException e) {
        throw new IOException("Error parsing synonyms file:", e);
    }
}
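This inform method is identical to the SynonymFilterFactory version above; both lean on the same small TokenizerFactory SPI surface. That surface, in one self-contained sketch (the "whitespace" name is only an illustration):

import java.util.HashMap;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.TokenizerFactory;

public class TokenizerFactorySpiDemo {
    public static void main(String[] args) {
        // Names registered through the SPI, e.g. "whitespace", "standard"
        System.out.println(TokenizerFactory.availableTokenizers());
        // Map a name to its factory class (also used by TestFactories below)
        Class<? extends TokenizerFactory> clazz = TokenizerFactory.lookupClass("whitespace");
        System.out.println(clazz);
        // Instantiate by name with an args map, then build a Tokenizer
        TokenizerFactory factory = TokenizerFactory.forName("whitespace", new HashMap<>());
        Tokenizer tokenizer = factory.create(); // no-arg create() uses the default AttributeFactory
        System.out.println(tokenizer.getClass());
    }
}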
Use of org.apache.lucene.analysis.util.TokenizerFactory in project lucene-solr by apache.
The class TestFactories, method doTestTokenizer:
private void doTestTokenizer(String tokenizer) throws IOException {
    Class<? extends TokenizerFactory> factoryClazz = TokenizerFactory.lookupClass(tokenizer);
    TokenizerFactory factory = (TokenizerFactory) initialize(factoryClazz);
    if (factory != null) {
        // if it implements MultiTermAware, sanity check its impl
        if (factory instanceof MultiTermAwareComponent) {
            AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent();
            assertNotNull(mtc);
            // it's not ok to return e.g. a charfilter here: but a tokenizer could wrap a filter around it
            assertFalse(mtc instanceof CharFilterFactory);
        }
        // beast it just a little; it shouldn't throw exceptions:
        // (it should have thrown them in initialize)
        Analyzer a = new FactoryAnalyzer(factory, null, null);
        checkRandomData(random(), a, 20, 20, false, false);
        a.close();
    }
}
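The initialize helper belongs to the test and is not shown. A rough sketch of what it might do (an assumption, not the exact TestFactories code): instantiate the factory reflectively with only the mandatory luceneMatchVersion argument, returning null for factories whose constructors demand arguments this generic test cannot supply:

private AbstractAnalysisFactory initialize(Class<? extends AbstractAnalysisFactory> factoryClazz) {
    Map<String, String> args = new HashMap<>();
    args.put("luceneMatchVersion", Version.LATEST.toString());
    try {
        // Conventional Map<String,String> constructor shared by analysis factories
        return factoryClazz.getConstructor(Map.class).newInstance(args);
    } catch (ReflectiveOperationException e) {
        return null; // no Map constructor, or the factory rejected the bare args
    }
}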
Use of org.apache.lucene.analysis.util.TokenizerFactory in project lucene-solr by apache.
The class AnalyzerFactoryTask, method setParams:
/**
 * Sets the params.
 * Analysis component factory names may optionally include the "Factory" suffix.
 *
 * @param params analysis pipeline specification: name, (optional) positionIncrementGap,
 *               (optional) offsetGap, zero or more CharFilterFactory names, one
 *               TokenizerFactory name, and zero or more TokenFilterFactory names
 */
@Override
@SuppressWarnings("fallthrough")
public void setParams(String params) {
    super.setParams(params);
    ArgType expectedArgType = ArgType.ANALYZER_ARG;
    final StreamTokenizer stok = new StreamTokenizer(new StringReader(params));
    stok.commentChar('#');
    stok.quoteChar('"');
    stok.quoteChar('\'');
    stok.eolIsSignificant(false);
    stok.ordinaryChar('(');
    stok.ordinaryChar(')');
    stok.ordinaryChar(':');
    stok.ordinaryChar(',');
    try {
        while (stok.nextToken() != StreamTokenizer.TT_EOF) {
            switch (stok.ttype) {
                case ',': {
                    // Do nothing
                    break;
                }
                case StreamTokenizer.TT_WORD: {
                    if (expectedArgType.equals(ArgType.ANALYZER_ARG)) {
                        final String argName = stok.sval;
                        if (!argName.equalsIgnoreCase("name") && !argName.equalsIgnoreCase("positionIncrementGap") && !argName.equalsIgnoreCase("offsetGap")) {
                            throw new RuntimeException("Line #" + lineno(stok) + ": Missing 'name' param to AnalyzerFactory: '" + params + "'");
                        }
                        stok.nextToken();
                        if (stok.ttype != ':') {
                            throw new RuntimeException("Line #" + lineno(stok) + ": Missing ':' after '" + argName + "' param to AnalyzerFactory");
                        }
                        stok.nextToken();
                        String argValue = stok.sval;
                        switch (stok.ttype) {
                            case StreamTokenizer.TT_NUMBER: {
                                argValue = Double.toString(stok.nval);
                                // Drop the ".0" from numbers, for integer arguments
                                argValue = TRAILING_DOT_ZERO_PATTERN.matcher(argValue).replaceFirst("");
                                // Intentional fallthrough
                            }
                            case '"':
                            case '\'':
                            case StreamTokenizer.TT_WORD: {
                                if (argName.equalsIgnoreCase("name")) {
                                    factoryName = argValue;
                                    expectedArgType = ArgType.ANALYZER_ARG_OR_CHARFILTER_OR_TOKENIZER;
                                } else {
                                    int intArgValue = 0;
                                    try {
                                        intArgValue = Integer.parseInt(argValue);
                                    } catch (NumberFormatException e) {
                                        throw new RuntimeException("Line #" + lineno(stok) + ": Exception parsing " + argName + " value '" + argValue + "'", e);
                                    }
                                    if (argName.equalsIgnoreCase("positionIncrementGap")) {
                                        positionIncrementGap = intArgValue;
                                    } else if (argName.equalsIgnoreCase("offsetGap")) {
                                        offsetGap = intArgValue;
                                    }
                                }
                                break;
                            }
                            case StreamTokenizer.TT_EOF: {
                                throw new RuntimeException("Unexpected EOF: " + stok.toString());
                            }
                            default: {
                                throw new RuntimeException("Line #" + lineno(stok) + ": Unexpected token: " + stok.toString());
                            }
                        }
                    } else if (expectedArgType.equals(ArgType.ANALYZER_ARG_OR_CHARFILTER_OR_TOKENIZER)) {
                        final String argName = stok.sval;
                        if (argName.equalsIgnoreCase("positionIncrementGap") || argName.equalsIgnoreCase("offsetGap")) {
                            stok.nextToken();
                            if (stok.ttype != ':') {
                                throw new RuntimeException("Line #" + lineno(stok) + ": Missing ':' after '" + argName + "' param to AnalyzerFactory");
                            }
                            stok.nextToken();
                            int intArgValue = (int) stok.nval;
                            switch (stok.ttype) {
                                case '"':
                                case '\'':
                                case StreamTokenizer.TT_WORD: {
                                    intArgValue = 0;
                                    try {
                                        intArgValue = Integer.parseInt(stok.sval.trim());
                                    } catch (NumberFormatException e) {
                                        throw new RuntimeException("Line #" + lineno(stok) + ": Exception parsing " + argName + " value '" + stok.sval + "'", e);
                                    }
                                    // Intentional fall-through
                                }
                                case StreamTokenizer.TT_NUMBER: {
                                    if (argName.equalsIgnoreCase("positionIncrementGap")) {
                                        positionIncrementGap = intArgValue;
                                    } else if (argName.equalsIgnoreCase("offsetGap")) {
                                        offsetGap = intArgValue;
                                    }
                                    break;
                                }
                                case StreamTokenizer.TT_EOF: {
                                    throw new RuntimeException("Unexpected EOF: " + stok.toString());
                                }
                                default: {
                                    throw new RuntimeException("Line #" + lineno(stok) + ": Unexpected token: " + stok.toString());
                                }
                            }
                            break;
                        }
                        try {
                            final Class<? extends CharFilterFactory> clazz;
                            clazz = lookupAnalysisClass(argName, CharFilterFactory.class);
                            createAnalysisPipelineComponent(stok, clazz);
                        } catch (IllegalArgumentException e) {
                            try {
                                final Class<? extends TokenizerFactory> clazz;
                                clazz = lookupAnalysisClass(argName, TokenizerFactory.class);
                                createAnalysisPipelineComponent(stok, clazz);
                                expectedArgType = ArgType.TOKENFILTER;
                            } catch (IllegalArgumentException e2) {
                                throw new RuntimeException("Line #" + lineno(stok) + ": Can't find class '" + argName + "' as CharFilterFactory or TokenizerFactory");
                            }
                        }
                    } else {
                        // expectedArgType = ArgType.TOKENFILTER
                        final String className = stok.sval;
                        final Class<? extends TokenFilterFactory> clazz;
                        try {
                            clazz = lookupAnalysisClass(className, TokenFilterFactory.class);
                        } catch (IllegalArgumentException e) {
                            throw new RuntimeException("Line #" + lineno(stok) + ": Can't find class '" + className + "' as TokenFilterFactory");
                        }
                        createAnalysisPipelineComponent(stok, clazz);
                    }
                    break;
                }
                default: {
                    throw new RuntimeException("Line #" + lineno(stok) + ": Unexpected token: " + stok.toString());
                }
            }
        }
    } catch (RuntimeException e) {
        if (e.getMessage().startsWith("Line #")) {
            throw e;
        } else {
            throw new RuntimeException("Line #" + lineno(stok) + ": ", e);
        }
    } catch (Throwable t) {
        throw new RuntimeException("Line #" + lineno(stok) + ": ", t);
    }
    final AnalyzerFactory analyzerFactory = new AnalyzerFactory(charFilterFactories, tokenizerFactory, tokenFilterFactories);
    analyzerFactory.setPositionIncrementGap(positionIncrementGap);
    analyzerFactory.setOffsetGap(offsetGap);
    getRunData().getAnalyzerFactories().put(factoryName, analyzerFactory);
}
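For orientation, a hypothetical call showing the shape of a specification this parser accepts (the name and values are illustrative, not from the benchmark sources; commas are skipped, and component names may drop the "Factory" suffix):

// Declares a name and a gap, then the pipeline: one tokenizer followed by a token filter.
analyzerFactoryTask.setParams(
    "name:myAnalyzer, positionIncrementGap:100, StandardTokenizer, LowerCaseFilter");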