Example usage of org.apache.lucene.analysis.util.TokenFilterFactory in the Apache Stanbol project: the tokenize method of the LuceneLabelTokenizer class.
@Override
public String[] tokenize(String label, String language) {
    if (label == null) {
        throw new IllegalArgumentException("The parsed label MUST NOT be NULL!");
    }
    //only process languages this tokenizer is configured for (null language is
    //accepted when the language configuration uses a wildcard)
    if ((language == null && langConf.useWildcard()) || langConf.isLanguage(language)) {
        if (label.isEmpty()) {
            return EMPTY;
        }
        Reader reader = new StringReader(label);
        //build the analysing chain: [CharFilter] -> Tokenizer -> TokenFilter*
        TokenStream tokenizer;
        if (charFilterFactory != null) {
            tokenizer = tokenizerFactory.create(charFilterFactory.create(reader));
        } else {
            tokenizer = tokenizerFactory.create(reader);
        }
        for (TokenFilterFactory filterFactory : filterFactories) {
            tokenizer = filterFactory.create(tokenizer);
        }
        List<String> tokens = new ArrayList<String>(8);
        try {
            //Lucene reuses a single attribute instance per stream - fetch it
            //once instead of on every increment
            OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class);
            tokenizer.reset();
            while (tokenizer.incrementToken()) {
                //tokens are reported as offsets into the parsed label
                tokens.add(label.substring(offset.startOffset(), offset.endOffset()));
            }
            tokenizer.end();
        } catch (IOException e) {
            log.error("IOException while reading from a StringReader :(", e);
            return null;
        } finally {
            //close the chain even if reset/incrementToken/end failed to avoid
            //leaking analysis resources (was previously skipped on exceptions)
            try {
                tokenizer.close();
            } catch (IOException ignored) {
                //backed by a StringReader - close can not meaningfully fail
            }
        }
        return tokens.toArray(new String[tokens.size()]);
    } else {
        log.trace("Language {} not configured to be supported", language);
        return null;
    }
}
Example usage of org.apache.lucene.analysis.util.TokenFilterFactory in the Apache Stanbol project: the activate method of the LuceneLabelTokenizer class.
/**
 * Activates this component: initialises the optional {@link CharFilterFactory},
 * the required {@link TokenizerFactory}, the (possibly empty) list of
 * {@link TokenFilterFactory token filters} and the language configuration from
 * the component properties.
 *
 * @param ctx the OSGi component context providing the configuration
 * @throws ConfigurationException if the tokenizer class is missing, the token
 *         filter property has an unsupported type or the language
 *         configuration is absent
 */
@Activate
protected void activate(ComponentContext ctx) throws ConfigurationException {
    //init the Solr ResourceLoader used for initialising the components
    resourceLoader = new StanbolResourceLoader(parentResourceLoader);
    //init the Solr CharFilterFactory (optional)
    Object value = ctx.getProperties().get(PROPERTY_CHAR_FILTER_FACTORY);
    if (value != null && !value.toString().isEmpty() && !DEFAULT_CLASS_NAME_CONFIG.equals(value)) {
        Entry<String, Map<String, String>> charFilterConfig = parseConfigLine(PROPERTY_CHAR_FILTER_FACTORY, value.toString());
        charFilterFactory = initAnalyzer(PROPERTY_CHAR_FILTER_FACTORY, charFilterConfig.getKey(), CharFilterFactory.class, charFilterConfig.getValue());
    } else {
        charFilterFactory = null;
    }
    //now initialise the TokenizerFactory (required)
    value = ctx.getProperties().get(PROPERTY_TOKENIZER_FACTORY);
    if (value == null || value.toString().isEmpty() || DEFAULT_CLASS_NAME_CONFIG.equals(value)) {
        throw new ConfigurationException(PROPERTY_TOKENIZER_FACTORY, "The class name of the Lucene Tokenizer MUST BE configured");
    }
    //NOTE: parse errors must be reported against the tokenizer property
    //(previously PROPERTY_CHAR_FILTER_FACTORY was passed by copy-paste mistake)
    Entry<String, Map<String, String>> tokenizerConfig = parseConfigLine(PROPERTY_TOKENIZER_FACTORY, value.toString());
    tokenizerFactory = initAnalyzer(PROPERTY_TOKENIZER_FACTORY, tokenizerConfig.getKey(), TokenizerFactory.class, tokenizerConfig.getValue());
    //initialise the list of Token Filters - accepts Collection, String[] or a
    //single String value
    Collection<String> values;
    value = ctx.getProperties().get(PROPERTY_TOKEN_FILTER_FACTORY);
    if (value == null) {
        values = Collections.emptyList();
    } else if (value instanceof Collection<?>) {
        values = new ArrayList<String>(((Collection<?>) value).size());
        for (Object v : (Collection<?>) value) {
            values.add(v.toString());
        }
    } else if (value instanceof String[]) {
        values = Arrays.asList((String[]) value);
    } else if (value instanceof String) {
        values = Collections.singleton((String) value);
    } else {
        throw new ConfigurationException(PROPERTY_TOKEN_FILTER_FACTORY, "The type '" + value.getClass() + "' of the parsed value is not supported (supported are " + "Collections, String[] and String values)!");
    }
    for (String filterConfigLine : values) {
        if (filterConfigLine == null || filterConfigLine.isEmpty() || DEFAULT_CLASS_NAME_CONFIG.equals(filterConfigLine)) {
            //ignore null, empty and the default value
            continue;
        }
        //NOTE: parse errors must be reported against the token filter property
        //(previously PROPERTY_CHAR_FILTER_FACTORY was passed by copy-paste mistake)
        Entry<String, Map<String, String>> filterConfig = parseConfigLine(PROPERTY_TOKEN_FILTER_FACTORY, filterConfigLine);
        TokenFilterFactory tff = initAnalyzer(PROPERTY_TOKEN_FILTER_FACTORY, filterConfig.getKey(), TokenFilterFactory.class, filterConfig.getValue());
        filterFactories.add(tff);
    }
    //init the language configuration
    value = ctx.getProperties().get(LabelTokenizer.SUPPORTED_LANUAGES);
    if (value == null) {
        throw new ConfigurationException(LabelTokenizer.SUPPORTED_LANUAGES, "The language " + "configuration MUST BE present!");
    }
    langConf.setConfiguration(ctx.getProperties());
}
Example usage of org.apache.lucene.analysis.util.TokenFilterFactory in the Apache Tika project: the buildTokenFilterFactories method of the AnalyzerDeserializer class.
/**
 * Builds the chain of {@link TokenFilterFactory}s described by the parsed JSON
 * element. Each array entry must be a map with a "factory" class name and an
 * optional "params" map. Class names may use the "oala." shorthand for
 * "org.apache.lucene.analysis.". If {@code maxTokens > -1} a
 * {@link LimitTokenCountFilterFactory} is appended.
 *
 * @param el the JSON array describing the token filters (may be null)
 * @param analyzerName name of the enclosing analyzer, used in error messages
 * @param maxTokens maximum token count, or -1 for unlimited
 * @return the factories in declaration order, or null if {@code el} is null/JSON null
 * @throws IllegalArgumentException if the JSON structure is invalid or a
 *         factory class is not registered with the SPI
 */
private static TokenFilterFactory[] buildTokenFilterFactories(JsonElement el, String analyzerName, int maxTokens) throws IOException {
    if (el == null || el.isJsonNull()) {
        return null;
    }
    if (!el.isJsonArray()) {
        throw new IllegalArgumentException("Expecting array for tokenfilters, but got:" + el.toString() + " in " + analyzerName);
    }
    JsonArray jsonArray = (JsonArray) el;
    List<TokenFilterFactory> ret = new LinkedList<>();
    for (JsonElement filterMap : jsonArray) {
        if (!(filterMap instanceof JsonObject)) {
            throw new IllegalArgumentException("Expecting a map with \"factory\" string and \"params\" map in token filter factory;" + " not: " + filterMap.toString() + " in " + analyzerName);
        }
        JsonElement factoryEl = ((JsonObject) filterMap).get(FACTORY);
        if (factoryEl == null || !factoryEl.isJsonPrimitive()) {
            throw new IllegalArgumentException("Expecting value for factory in token filter factory builder in " + analyzerName);
        }
        String factoryName = factoryEl.getAsString();
        //expand the "oala." shorthand (substring avoids treating '.' as a regex
        //metacharacter, as replaceFirst would)
        if (factoryName.startsWith("oala.")) {
            factoryName = "org.apache.lucene.analysis." + factoryName.substring("oala.".length());
        }
        JsonElement paramsEl = ((JsonObject) filterMap).get(PARAMS);
        Map<String, String> params = mapify(paramsEl);
        //resolve the SPI name registered for the requested factory class
        String spiName = null;
        for (String s : TokenFilterFactory.availableTokenFilters()) {
            Class<?> clazz = TokenFilterFactory.lookupClass(s);
            if (clazz.getName().equals(factoryName)) {
                spiName = s;
                break;
            }
        }
        if (spiName == null) {
            throw new IllegalArgumentException("A SPI class of type org.apache.lucene.analysis.util.TokenFilterFactory with name '" + factoryName + "' does not exist.");
        }
        try {
            TokenFilterFactory tokenFilterFactory = TokenFilterFactory.forName(spiName, params);
            if (tokenFilterFactory instanceof ResourceLoaderAware) {
                //let the factory load its resources (stop word lists etc.)
                ((ResourceLoaderAware) tokenFilterFactory).inform(new ClasspathResourceLoader(AnalyzerDeserializer.class));
            }
            ret.add(tokenFilterFactory);
        } catch (IllegalArgumentException e) {
            //add the analyzer name as context and preserve the cause
            throw new IllegalArgumentException("While loading " + analyzerName, e);
        }
    }
    if (maxTokens > -1) {
        Map<String, String> m = new HashMap<>();
        m.put("maxTokenCount", Integer.toString(maxTokens));
        ret.add(new LimitTokenCountFilterFactory(m));
    }
    //toArray returns an empty array for an empty list, no special case needed
    return ret.toArray(new TokenFilterFactory[ret.size()]);
}
Example usage of org.apache.lucene.analysis.util.TokenFilterFactory in the Apache Jackrabbit Oak project: the loadTokenFilterFactories method of the NodeStateAnalyzerFactory class.
/**
 * Creates one {@link TokenFilterFactory} per child node of the parsed node
 * state. Each child's name selects the factory type and its properties become
 * the factory arguments; every factory is initialised via {@code init} before
 * being returned.
 *
 * @param tokenFiltersState node state whose children describe the token filters
 * @return the initialised factories, one per child, in iteration order
 */
private TokenFilterFactory[] loadTokenFilterFactories(NodeState tokenFiltersState) {
    List<TokenFilterFactory> factories = newArrayList();
    Tree filtersTree = TreeFactory.createReadOnlyTree(tokenFiltersState);
    for (Tree filterTree : filtersTree.getChildren()) {
        String childName = filterTree.getName();
        NodeState filterState = tokenFiltersState.getChildNode(childName);
        TokenFilterFactory factory = TokenFilterFactory.forName(
                getFactoryType(filterState, childName),
                convertNodeState(filterState));
        init(factory, filterState);
        factories.add(factory);
    }
    return factories.toArray(new TokenFilterFactory[factories.size()]);
}
Example usage of org.apache.lucene.analysis.util.TokenFilterFactory in the Apache Lucene/Solr project: the normalize method of the CustomAnalyzer class.
/**
 * Applies the normalization part of the analysis chain: only token filters
 * that are {@link MultiTermAwareComponent}s participate; each contributes its
 * multi-term variant to the chain. All other filters are skipped.
 *
 * @param fieldName the field being normalized (unused here)
 * @param in the input token stream
 * @return the stream wrapped by the multi-term aware filters, or {@code in}
 *         unchanged if none apply
 */
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
    TokenStream stream = in;
    for (TokenFilterFactory factory : tokenFilters) {
        if (!(factory instanceof MultiTermAwareComponent)) {
            //non multi-term aware filters take no part in normalization
            continue;
        }
        TokenFilterFactory multiTerm =
                (TokenFilterFactory) ((MultiTermAwareComponent) factory).getMultiTermComponent();
        stream = multiTerm.create(stream);
    }
    return stream;
}
Aggregations