Use of org.apache.lucene.analysis.util.TokenizerFactory in project lucene-solr by apache.
The class TestAllAnalyzersHaveFactories, method test:
public void test() throws Exception {
  List<Class<?>> analysisClasses = TestRandomChains.getClassesForPackage("org.apache.lucene.analysis");
  for (final Class<?> c : analysisClasses) {
    final int modifiers = c.getModifiers();
    // don't waste time with abstract classes; deprecated ones are typically back compat hacks
    if (Modifier.isAbstract(modifiers)
        || !Modifier.isPublic(modifiers)
        || c.isSynthetic()
        || c.isAnonymousClass()
        || c.isMemberClass()
        || c.isInterface()
        || testComponents.contains(c)
        || crazyComponents.contains(c)
        || oddlyNamedComponents.contains(c)
        || tokenFiltersWithoutFactory.contains(c)
        || c.isAnnotationPresent(Deprecated.class)
        || !(Tokenizer.class.isAssignableFrom(c)
             || TokenFilter.class.isAssignableFrom(c)
             || CharFilter.class.isAssignableFrom(c))) {
      continue;
    }
    Map<String, String> args = new HashMap<>();
    args.put("luceneMatchVersion", Version.LATEST.toString());
    if (Tokenizer.class.isAssignableFrom(c)) {
      String clazzName = c.getSimpleName();
      assertTrue(clazzName.endsWith("Tokenizer"));
      // strip the "Tokenizer" suffix (9 chars) to get the SPI name
      String simpleName = clazzName.substring(0, clazzName.length() - 9);
      assertNotNull(TokenizerFactory.lookupClass(simpleName));
      TokenizerFactory instance = null;
      try {
        instance = TokenizerFactory.forName(simpleName, args);
        assertNotNull(instance);
        if (instance instanceof ResourceLoaderAware) {
          ((ResourceLoaderAware) instance).inform(loader);
        }
        assertSame(c, instance.create().getClass());
      } catch (IllegalArgumentException e) {
        // TODO: For now, pass, because some factories do not yet have a default config that always works
      }
    } else if (TokenFilter.class.isAssignableFrom(c)) {
      String clazzName = c.getSimpleName();
      assertTrue(clazzName.endsWith("Filter"));
      // strip the "TokenFilter" (11 chars) or "Filter" (6 chars) suffix to get the SPI name
      String simpleName = clazzName.substring(0, clazzName.length() - (clazzName.endsWith("TokenFilter") ? 11 : 6));
      assertNotNull(TokenFilterFactory.lookupClass(simpleName));
      TokenFilterFactory instance = null;
      try {
        instance = TokenFilterFactory.forName(simpleName, args);
        assertNotNull(instance);
        if (instance instanceof ResourceLoaderAware) {
          ((ResourceLoaderAware) instance).inform(loader);
        }
        Class<? extends TokenStream> createdClazz = instance.create(new KeywordTokenizer()).getClass();
        // only check the created class if the factory actually wrapped the input
        if (KeywordTokenizer.class != createdClazz) {
          assertSame(c, createdClazz);
        }
      } catch (IllegalArgumentException e) {
        // TODO: For now, pass, because some factories do not yet have a default config that always works
      }
    } else if (CharFilter.class.isAssignableFrom(c)) {
      String clazzName = c.getSimpleName();
      assertTrue(clazzName.endsWith("CharFilter"));
      // strip the "CharFilter" suffix (10 chars) to get the SPI name
      String simpleName = clazzName.substring(0, clazzName.length() - 10);
      assertNotNull(CharFilterFactory.lookupClass(simpleName));
      CharFilterFactory instance = null;
      try {
        instance = CharFilterFactory.forName(simpleName, args);
        assertNotNull(instance);
        if (instance instanceof ResourceLoaderAware) {
          ((ResourceLoaderAware) instance).inform(loader);
        }
        Class<? extends Reader> createdClazz = instance.create(new StringReader("")).getClass();
        // only check the created class if the factory actually wrapped the input
        if (StringReader.class != createdClazz) {
          assertSame(c, createdClazz);
        }
      } catch (IllegalArgumentException e) {
        // TODO: For now, pass, because some factories do not yet have a default config that always works
      }
    }
  }
}
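The convention this test enforces can be seen from the other side: the SPI name is the component's simple class name minus its suffix, and the factory's forName lookup resolves it against the SPI registry (case-insensitively). A minimal standalone sketch, assuming StandardTokenizer is on the classpath; the class name TokenizerSpiDemo is illustrative:

import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.Version;

public class TokenizerSpiDemo {
  public static void main(String[] args) {
    Map<String, String> factoryArgs = new HashMap<>();
    factoryArgs.put("luceneMatchVersion", Version.LATEST.toString());
    // "StandardTokenizer" minus the "Tokenizer" suffix gives the SPI name "Standard";
    // the SPI loader normalizes case, so "standard" would resolve as well.
    TokenizerFactory factory = TokenizerFactory.forName("Standard", factoryArgs);
    Tokenizer tokenizer = factory.create();
    // expected: class org.apache.lucene.analysis.standard.StandardTokenizer
    System.out.println(tokenizer.getClass());
  }
}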
Use of org.apache.lucene.analysis.util.TokenizerFactory in project lucene-solr by apache.
The class AnalyzerFactoryTask, method createAnalysisPipelineComponent:
/**
* Instantiates the given analysis factory class after pulling params from
* the given stream tokenizer, then stores the result in the appropriate
* pipeline component list.
*
* @param stok stream tokenizer from which to draw analysis factory params
* @param clazz analysis factory class to instantiate
*/
@SuppressWarnings("fallthrough")
private void createAnalysisPipelineComponent(StreamTokenizer stok, Class<? extends AbstractAnalysisFactory> clazz) {
  Map<String, String> argMap = new HashMap<>();
  boolean parenthetical = false;
  try {
    WHILE_LOOP: while (stok.nextToken() != StreamTokenizer.TT_EOF) {
      switch (stok.ttype) {
        case ',': {
          if (parenthetical) {
            // Do nothing
            break;
          } else {
            // Finished reading this analysis factory configuration
            break WHILE_LOOP;
          }
        }
        case '(': {
          if (parenthetical) {
            throw new RuntimeException("Line #" + lineno(stok) + ": Unexpected opening parenthesis.");
          }
          parenthetical = true;
          break;
        }
        case ')': {
          if (parenthetical) {
            parenthetical = false;
          } else {
            throw new RuntimeException("Line #" + lineno(stok) + ": Unexpected closing parenthesis.");
          }
          break;
        }
        case StreamTokenizer.TT_WORD: {
          if (!parenthetical) {
            throw new RuntimeException("Line #" + lineno(stok) + ": Unexpected token '" + stok.sval + "'");
          }
          String argName = stok.sval;
          stok.nextToken();
          if (stok.ttype != ':') {
            throw new RuntimeException("Line #" + lineno(stok) + ": Missing ':' after '" + argName + "' param to " + clazz.getSimpleName());
          }
          stok.nextToken();
          String argValue = stok.sval;
          switch (stok.ttype) {
            case StreamTokenizer.TT_NUMBER: {
              argValue = Double.toString(stok.nval);
              // Drop the ".0" from numbers, for integer arguments
              argValue = TRAILING_DOT_ZERO_PATTERN.matcher(argValue).replaceFirst("");
              // Intentional fall-through
            }
            case '"':
            case '\'':
            case StreamTokenizer.TT_WORD: {
              argMap.put(argName, argValue);
              break;
            }
            case StreamTokenizer.TT_EOF: {
              throw new RuntimeException("Unexpected EOF: " + stok.toString());
            }
            default: {
              throw new RuntimeException("Line #" + lineno(stok) + ": Unexpected token: " + stok.toString());
            }
          }
        }
      }
    }
    if (!argMap.containsKey("luceneMatchVersion")) {
      argMap.put("luceneMatchVersion", Version.LATEST.toString());
    }
    final AbstractAnalysisFactory instance;
    try {
      instance = clazz.getConstructor(Map.class).newInstance(argMap);
    } catch (Exception e) {
      throw new RuntimeException("Line #" + lineno(stok) + ": ", e);
    }
    if (instance instanceof ResourceLoaderAware) {
      Path baseDir = Paths.get(getRunData().getConfig().get("work.dir", "work"));
      if (!Files.isDirectory(baseDir)) {
        baseDir = Paths.get(".");
      }
      ((ResourceLoaderAware) instance).inform(new FilesystemResourceLoader(baseDir));
    }
    if (CharFilterFactory.class.isAssignableFrom(clazz)) {
      charFilterFactories.add((CharFilterFactory) instance);
    } else if (TokenizerFactory.class.isAssignableFrom(clazz)) {
      tokenizerFactory = (TokenizerFactory) instance;
    } else if (TokenFilterFactory.class.isAssignableFrom(clazz)) {
      tokenFilterFactories.add((TokenFilterFactory) instance);
    }
  } catch (RuntimeException e) {
    if (e.getMessage().startsWith("Line #")) {
      throw e;
    } else {
      throw new RuntimeException("Line #" + lineno(stok) + ": ", e);
    }
  } catch (Throwable t) {
    throw new RuntimeException("Line #" + lineno(stok) + ": ", t);
  }
}
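For illustration, a hypothetical factory configuration this parser would accept (the factory shorthand and param names are a sketch; real param names come from the individual factories):

  NGramTokenizer(minGramSize:2, maxGramSize:3)

Note the TT_NUMBER fall-through: StreamTokenizer reads minGramSize:2 as the double 2.0, so the parser renders it as "2.0", strips the trailing ".0" with TRAILING_DOT_ZERO_PATTERN, then intentionally falls through to the storing case. The resulting argMap would hold {"minGramSize" -> "2", "maxGramSize" -> "3"}, which is what integer-valued factory constructors expect.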
Use of org.apache.lucene.analysis.util.TokenizerFactory in project tika by apache.
The class AnalyzerDeserializer, method buildAnalyzer:
public static Analyzer buildAnalyzer(String analyzerName, JsonElement value, int maxTokens) throws IOException {
  if (!value.isJsonObject()) {
    throw new IllegalArgumentException("Expecting map of charfilter, tokenizer, tokenfilters");
  }
  JsonObject aRoot = (JsonObject) value;
  CharFilterFactory[] charFilters = new CharFilterFactory[0];
  TokenizerFactory tokenizerFactory = null;
  TokenFilterFactory[] tokenFilterFactories = new TokenFilterFactory[0];
  for (Map.Entry<String, JsonElement> e : aRoot.entrySet()) {
    String k = e.getKey();
    if (k.equals(CHAR_FILTERS)) {
      charFilters = buildCharFilters(e.getValue(), analyzerName);
    } else if (k.equals(TOKEN_FILTERS)) {
      tokenFilterFactories = buildTokenFilterFactories(e.getValue(), analyzerName, maxTokens);
    } else if (k.equals(TOKENIZER)) {
      tokenizerFactory = buildTokenizerFactory(e.getValue(), analyzerName);
    } else if (!k.equals(COMMENT)) {
      throw new IllegalArgumentException("Should have one of three values here: " + CHAR_FILTERS + ", " + TOKENIZER + ", " + TOKEN_FILTERS + ". I don't recognize: " + k);
    }
  }
  if (tokenizerFactory == null) {
    throw new IllegalArgumentException("Must specify at least a tokenizer factory for an analyzer!");
  }
  return new MyTokenizerChain(charFilters, tokenizerFactory, tokenFilterFactories);
}
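For illustration, a minimal invocation, with two loudly flagged assumptions: the key constants are taken to resolve to lowercase "charfilters"/"tokenizer"/"tokenfilters" (only the error messages hint at this), and -1 is used as a placeholder for maxTokens since the snippet does not show how that value is interpreted:

import com.google.gson.JsonElement;
import com.google.gson.JsonParser;
import org.apache.lucene.analysis.Analyzer;

public class BuildAnalyzerDemo {
  public static void main(String[] args) throws Exception {
    // Hypothetical JSON; the key names are assumptions based on the error messages above.
    // The "oala." prefix is expanded by buildTokenizerFactory (see next snippet).
    String json = "{"
        + "\"tokenizer\": {"
        + "  \"factory\": \"oala.standard.StandardTokenizerFactory\","
        + "  \"params\": {}"
        + "}}";
    JsonElement el = new JsonParser().parse(json);
    Analyzer analyzer = AnalyzerDeserializer.buildAnalyzer("demo", el, -1);
    System.out.println(analyzer);
  }
}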
Use of org.apache.lucene.analysis.util.TokenizerFactory in project tika by apache.
The class AnalyzerDeserializer, method buildTokenizerFactory:
private static TokenizerFactory buildTokenizerFactory(JsonElement map, String analyzerName) throws IOException {
  if (!(map instanceof JsonObject)) {
    throw new IllegalArgumentException("Expecting a map with \"factory\" string and \"params\" map in tokenizer factory; not: " + map.toString() + " in " + analyzerName);
  }
  JsonElement factoryEl = ((JsonObject) map).get(FACTORY);
  if (factoryEl == null || !factoryEl.isJsonPrimitive()) {
    throw new IllegalArgumentException("Expecting value for factory in tokenizer factory builder in: " + analyzerName);
  }
  String factoryName = factoryEl.getAsString();
  // expand the "oala." shorthand to the full "org.apache.lucene.analysis." package prefix
  factoryName = factoryName.startsWith("oala.") ? factoryName.replaceFirst("oala.", "org.apache.lucene.analysis.") : factoryName;
  JsonElement paramsEl = ((JsonObject) map).get(PARAMS);
  Map<String, String> params = mapify(paramsEl);
  // reverse-map the fully qualified class name to the short name registered with the SPI
  String spiName = "";
  for (String s : TokenizerFactory.availableTokenizers()) {
    Class<?> clazz = TokenizerFactory.lookupClass(s);
    if (clazz.getName().equals(factoryName)) {
      spiName = s;
      break;
    }
  }
  if (spiName.equals("")) {
    throw new IllegalArgumentException("A SPI class of type org.apache.lucene.analysis.util.TokenizerFactory with name '" + factoryName + "' does not exist.");
  }
  try {
    TokenizerFactory tokenizerFactory = TokenizerFactory.forName(spiName, params);
    if (tokenizerFactory instanceof ResourceLoaderAware) {
      ((ResourceLoaderAware) tokenizerFactory).inform(new ClasspathResourceLoader(AnalyzerDeserializer.class));
    }
    return tokenizerFactory;
  } catch (IllegalArgumentException e) {
    throw new IllegalArgumentException("While working on " + analyzerName, e);
  }
}
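The reverse lookup in the middle of buildTokenizerFactory reads well in isolation: given a fully qualified factory class name, scan the SPI registry for the short name it was registered under. A standalone sketch of just that step (the registry stores names lowercased, so the expected result here is "standard"; the class name SpiNameLookup is illustrative):

import org.apache.lucene.analysis.util.TokenizerFactory;

public class SpiNameLookup {
  public static void main(String[] args) {
    String factoryName = "org.apache.lucene.analysis.standard.StandardTokenizerFactory";
    String spiName = null;
    for (String s : TokenizerFactory.availableTokenizers()) {
      // lookupClass maps an SPI name back to the factory class it would load
      if (TokenizerFactory.lookupClass(s).getName().equals(factoryName)) {
        spiName = s;
        break;
      }
    }
    System.out.println(spiName); // expected: "standard"
  }
}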