Search in sources:

Example 21 with TokenFilterFactory

Use of org.apache.lucene.analysis.util.TokenFilterFactory in the Apache Stanbol project.

From the class LuceneLabelTokenizer, method tokenize.

/**
 * Tokenizes the parsed label by running it through the configured Lucene
 * analysis chain (optional CharFilter, Tokenizer, TokenFilters) and mapping
 * the token offsets back onto the original label text.
 *
 * @param label the label to tokenize; MUST NOT be <code>null</code>
 * @param language the language of the label, or <code>null</code> if unknown
 * @return the tokens of the label; {@link #EMPTY} for an empty label;
 *         <code>null</code> if the language is not supported by this
 *         tokenizer or an {@link IOException} occurred while tokenizing
 * @throws IllegalArgumentException if the parsed label is <code>null</code>
 */
@Override
public String[] tokenize(String label, String language) {
    if (label == null) {
        throw new IllegalArgumentException("The parsed label MUST NOT be NULL!");
    }
    if ((language == null && langConf.useWildcard()) || langConf.isLanguage(language)) {
        if (label.isEmpty()) {
            return EMPTY;
        }
        Reader reader = new StringReader(label);
        TokenStream tokenizer;
        if (charFilterFactory != null) {
            tokenizer = tokenizerFactory.create(charFilterFactory.create(reader));
        } else {
            tokenizer = tokenizerFactory.create(reader);
        }
        //build the analysing chain
        for (TokenFilterFactory filterFactory : filterFactories) {
            tokenizer = filterFactory.create(tokenizer);
        }
        List<String> tokens = new ArrayList<String>(8);
        try {
            //addAttribute returns the same instance on every call, so look it
            //up once before consuming the stream instead of per token
            OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class);
            tokenizer.reset();
            while (tokenizer.incrementToken()) {
                //use the offsets to extract the token text from the original
                //label (filters may have altered the term text itself)
                tokens.add(label.substring(offset.startOffset(), offset.endOffset()));
            }
            tokenizer.end();
        } catch (IOException e) {
            log.error("IOException while reading from a StringReader :(", e);
            return null;
        } finally {
            //always release the TokenStream - previously it leaked when
            //incrementToken() threw an IOException
            try {
                tokenizer.close();
            } catch (IOException ignored) {
                //closing a StringReader-backed stream cannot reasonably fail
            }
        }
        return tokens.toArray(new String[tokens.size()]);
    } else {
        log.trace("Language {} not configured to be supported", language);
        return null;
    }
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) StringReader(java.io.StringReader) ArrayList(java.util.ArrayList) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) CharArrayReader(java.io.CharArrayReader) Reader(java.io.Reader) StringReader(java.io.StringReader) IOException(java.io.IOException) TokenFilterFactory(org.apache.lucene.analysis.util.TokenFilterFactory)

Example 22 with TokenFilterFactory

Use of org.apache.lucene.analysis.util.TokenFilterFactory in the Apache Stanbol project.

From the class LuceneLabelTokenizer, method activate.

/**
 * Activates this component: initialises the Solr {@link ResourceLoader},
 * the optional {@link CharFilterFactory}, the required
 * {@link TokenizerFactory}, the list of {@link TokenFilterFactory}s and the
 * language configuration from the component properties.
 *
 * @param ctx the OSGi component context holding the configuration
 * @throws ConfigurationException if the tokenizer class is missing, a token
 *         filter value has an unsupported type, or the language
 *         configuration is not present
 */
@Activate
protected void activate(ComponentContext ctx) throws ConfigurationException {
    //init the Solr ResourceLoader used for initialising the components
    resourceLoader = new StanbolResourceLoader(parentResourceLoader);
    //init the Solr CharFilterFactory (optional)
    Object value = ctx.getProperties().get(PROPERTY_CHAR_FILTER_FACTORY);
    if (value != null && !value.toString().isEmpty() && !DEFAULT_CLASS_NAME_CONFIG.equals(value)) {
        Entry<String, Map<String, String>> charFilterConfig = parseConfigLine(PROPERTY_CHAR_FILTER_FACTORY, value.toString());
        charFilterFactory = initAnalyzer(PROPERTY_CHAR_FILTER_FACTORY, charFilterConfig.getKey(), CharFilterFactory.class, charFilterConfig.getValue());
    } else {
        charFilterFactory = null;
    }
    //now initialise the TokenizerFactory (required)
    value = ctx.getProperties().get(PROPERTY_TOKENIZER_FACTORY);
    if (value == null || value.toString().isEmpty() || DEFAULT_CLASS_NAME_CONFIG.equals(value)) {
        throw new ConfigurationException(PROPERTY_TOKENIZER_FACTORY, "The class name of the Lucene Tokenizer MUST BE configured");
    }
    //NOTE: fixed copy/paste error - parse errors are now reported against
    //PROPERTY_TOKENIZER_FACTORY instead of PROPERTY_CHAR_FILTER_FACTORY
    Entry<String, Map<String, String>> tokenizerConfig = parseConfigLine(PROPERTY_TOKENIZER_FACTORY, value.toString());
    tokenizerFactory = initAnalyzer(PROPERTY_TOKENIZER_FACTORY, tokenizerConfig.getKey(), TokenizerFactory.class, tokenizerConfig.getValue());
    //initialise the list of Token Filters (value may be a Collection,
    //a String[] or a single String depending on how it was configured)
    Collection<String> values;
    value = ctx.getProperties().get(PROPERTY_TOKEN_FILTER_FACTORY);
    if (value == null) {
        values = Collections.emptyList();
    } else if (value instanceof Collection<?>) {
        values = new ArrayList<String>(((Collection<?>) value).size());
        for (Object v : (Collection<Object>) value) {
            values.add(v.toString());
        }
    } else if (value instanceof String[]) {
        values = Arrays.asList((String[]) value);
    } else if (value instanceof String) {
        values = Collections.singleton((String) value);
    } else {
        throw new ConfigurationException(PROPERTY_TOKEN_FILTER_FACTORY, "The type '" + value.getClass() + "' of the parsed value is not supported (supported are " + "Collections, String[] and String values)!");
    }
    for (String filterConfigLine : values) {
        if (filterConfigLine == null || filterConfigLine.isEmpty() || DEFAULT_CLASS_NAME_CONFIG.equals(filterConfigLine)) {
            //ignore null, empty and the default value
            continue;
        }
        //NOTE: fixed copy/paste error - parse errors are now reported against
        //PROPERTY_TOKEN_FILTER_FACTORY instead of PROPERTY_CHAR_FILTER_FACTORY
        Entry<String, Map<String, String>> filterConfig = parseConfigLine(PROPERTY_TOKEN_FILTER_FACTORY, filterConfigLine);
        TokenFilterFactory tff = initAnalyzer(PROPERTY_TOKEN_FILTER_FACTORY, filterConfig.getKey(), TokenFilterFactory.class, filterConfig.getValue());
        filterFactories.add(tff);
    }
    //init the language configuration
    value = ctx.getProperties().get(LabelTokenizer.SUPPORTED_LANUAGES);
    if (value == null) {
        throw new ConfigurationException(LabelTokenizer.SUPPORTED_LANUAGES, "The language " + "configuration MUST BE present!");
    }
    langConf.setConfiguration(ctx.getProperties());
}
Also used : StanbolResourceLoader(org.apache.stanbol.commons.solr.utils.StanbolResourceLoader) TokenizerFactory(org.apache.lucene.analysis.util.TokenizerFactory) ConfigurationException(org.osgi.service.cm.ConfigurationException) CharFilterFactory(org.apache.lucene.analysis.util.CharFilterFactory) ArrayList(java.util.ArrayList) HashMap(java.util.HashMap) Map(java.util.Map) TokenFilterFactory(org.apache.lucene.analysis.util.TokenFilterFactory) Activate(org.apache.felix.scr.annotations.Activate)

Example 23 with TokenFilterFactory

Use of org.apache.lucene.analysis.util.TokenFilterFactory in the Apache Tika project.

From the class AnalyzerDeserializer, method buildTokenFilterFactories.

/**
 * Builds the array of {@link TokenFilterFactory}s described by the parsed
 * JSON element. Each array entry is expected to be a JSON object with a
 * "factory" class name and an optional "params" map. Factory class names
 * may use the "oala." shorthand for "org.apache.lucene.analysis.".
 *
 * @param el the JSON array describing the token filters; may be
 *        <code>null</code> or JSON null
 * @param analyzerName the analyzer name, used for error messages
 * @param maxTokens if &gt; -1 a {@link LimitTokenCountFilterFactory} with
 *        this limit is appended to the chain
 * @return the token filter factories, an empty array if none were
 *         configured, or <code>null</code> if <code>el</code> was absent
 * @throws IOException declared for parity with the other builder methods
 * @throws IllegalArgumentException if the JSON structure is invalid or a
 *         factory class cannot be resolved via the SPI
 */
private static TokenFilterFactory[] buildTokenFilterFactories(JsonElement el, String analyzerName, int maxTokens) throws IOException {
    if (el == null || el.isJsonNull()) {
        return null;
    }
    if (!el.isJsonArray()) {
        throw new IllegalArgumentException("Expecting array for tokenfilters, but got:" + el.toString() + " in " + analyzerName);
    }
    JsonArray jsonArray = (JsonArray) el;
    List<TokenFilterFactory> ret = new LinkedList<>();
    for (JsonElement filterMap : jsonArray) {
        if (!(filterMap instanceof JsonObject)) {
            throw new IllegalArgumentException("Expecting a map with \"factory\" string and \"params\" map in token filter factory;" + " not: " + filterMap.toString() + " in " + analyzerName);
        }
        JsonElement factoryEl = ((JsonObject) filterMap).get(FACTORY);
        if (factoryEl == null || !factoryEl.isJsonPrimitive()) {
            throw new IllegalArgumentException("Expecting value for factory in token filter factory builder in " + analyzerName);
        }
        String factoryName = factoryEl.getAsString();
        //expand the "oala." shorthand with a plain substring instead of
        //replaceFirst: "oala." is a regex where '.' matches any character
        if (factoryName.startsWith("oala.")) {
            factoryName = "org.apache.lucene.analysis." + factoryName.substring("oala.".length());
        }
        JsonElement paramsEl = ((JsonObject) filterMap).get(PARAMS);
        Map<String, String> params = mapify(paramsEl);
        //resolve the SPI name by matching the fully qualified class name
        String spiName = "";
        for (String s : TokenFilterFactory.availableTokenFilters()) {
            Class<?> clazz = TokenFilterFactory.lookupClass(s);
            if (clazz.getName().equals(factoryName)) {
                spiName = s;
                break;
            }
        }
        if (spiName.isEmpty()) {
            throw new IllegalArgumentException("A SPI class of type org.apache.lucene.analysis.util.TokenFilterFactory with name " + "'" + factoryName + "' does not exist.");
        }
        try {
            TokenFilterFactory tokenFilterFactory = TokenFilterFactory.forName(spiName, params);
            if (tokenFilterFactory instanceof ResourceLoaderAware) {
                //factories that load resources (stopword lists etc.) need a
                //ResourceLoader before they can be used
                ((ResourceLoaderAware) tokenFilterFactory).inform(new ClasspathResourceLoader(AnalyzerDeserializer.class));
            }
            ret.add(tokenFilterFactory);
        } catch (IllegalArgumentException e) {
            //add the analyzer name for context while preserving the cause
            throw new IllegalArgumentException("While loading " + analyzerName, e);
        }
    }
    if (maxTokens > -1) {
        Map<String, String> m = new HashMap<>();
        m.put("maxTokenCount", Integer.toString(maxTokens));
        ret.add(new LimitTokenCountFilterFactory(m));
    }
    //toArray handles the empty case as well, returning a zero-length array
    return ret.toArray(new TokenFilterFactory[ret.size()]);
}
Also used : HashMap(java.util.HashMap) JsonObject(com.google.gson.JsonObject) LinkedList(java.util.LinkedList) TokenFilterFactory(org.apache.lucene.analysis.util.TokenFilterFactory) JsonArray(com.google.gson.JsonArray) JsonElement(com.google.gson.JsonElement) ClasspathResourceLoader(org.apache.lucene.analysis.util.ClasspathResourceLoader) ResourceLoaderAware(org.apache.lucene.analysis.util.ResourceLoaderAware) LimitTokenCountFilterFactory(org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory)

Example 24 with TokenFilterFactory

Use of org.apache.lucene.analysis.util.TokenFilterFactory in the Apache Jackrabbit Oak project.

From the class NodeStateAnalyzerFactory, method loadTokenFilterFactories.

/**
 * Creates one {@link TokenFilterFactory} per child node of the parsed node
 * state. The factory type and arguments are read from each child node, and
 * every factory is initialised via {@code init} before being returned.
 *
 * @param tokenFiltersState node state whose children describe the filters
 * @return the initialised token filter factories, in child order
 */
private TokenFilterFactory[] loadTokenFilterFactories(NodeState tokenFiltersState) {
    List<TokenFilterFactory> factories = newArrayList();
    Tree root = TreeFactory.createReadOnlyTree(tokenFiltersState);
    for (Tree child : root.getChildren()) {
        String childName = child.getName();
        NodeState childState = tokenFiltersState.getChildNode(childName);
        //the factory class/SPI name and its arguments both come from the child node
        String factoryType = getFactoryType(childState, childName);
        Map<String, String> args = convertNodeState(childState);
        TokenFilterFactory factory = TokenFilterFactory.forName(factoryType, args);
        init(factory, childState);
        factories.add(factory);
    }
    return factories.toArray(new TokenFilterFactory[factories.size()]);
}
Also used : NodeState(org.apache.jackrabbit.oak.spi.state.NodeState) Tree(org.apache.jackrabbit.oak.api.Tree) TokenFilterFactory(org.apache.lucene.analysis.util.TokenFilterFactory)

Example 25 with TokenFilterFactory

Use of org.apache.lucene.analysis.util.TokenFilterFactory in the Apache Lucene/Solr project.

From the class CustomAnalyzer, method normalize.

/**
 * Wraps the incoming stream with the multi-term variants of the configured
 * token filters. Filters that are not {@link MultiTermAwareComponent}s are
 * intentionally skipped during normalization.
 *
 * @param fieldName the field being normalized (unused here)
 * @param in the stream to normalize
 * @return the stream wrapped by all multi-term-aware filters, in order
 */
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
    TokenStream stream = in;
    for (TokenFilterFactory factory : tokenFilters) {
        if (!(factory instanceof MultiTermAwareComponent)) {
            continue;
        }
        TokenFilterFactory multiTermFactory = (TokenFilterFactory) ((MultiTermAwareComponent) factory).getMultiTermComponent();
        stream = multiTermFactory.create(stream);
    }
    return stream;
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) MultiTermAwareComponent(org.apache.lucene.analysis.util.MultiTermAwareComponent) TokenFilterFactory(org.apache.lucene.analysis.util.TokenFilterFactory)

Aggregations

TokenFilterFactory (org.apache.lucene.analysis.util.TokenFilterFactory)40 CharFilterFactory (org.apache.lucene.analysis.util.CharFilterFactory)16 Analyzer (org.apache.lucene.analysis.Analyzer)12 TokenizerChain (org.apache.solr.analysis.TokenizerChain)12 TokenizerFactory (org.apache.lucene.analysis.util.TokenizerFactory)11 TokenStream (org.apache.lucene.analysis.TokenStream)10 ArrayList (java.util.ArrayList)7 HashMap (java.util.HashMap)7 Tokenizer (org.apache.lucene.analysis.Tokenizer)6 MultiTermAwareComponent (org.apache.lucene.analysis.util.MultiTermAwareComponent)5 IOException (java.io.IOException)4 StringReader (java.io.StringReader)4 Test (org.junit.Test)4 Reader (java.io.Reader)3 Map (java.util.Map)3 CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream)3 KeywordTokenizerFactory (org.apache.lucene.analysis.core.KeywordTokenizerFactory)3 ResourceLoaderAware (org.apache.lucene.analysis.util.ResourceLoaderAware)3 SolrException (org.apache.solr.common.SolrException)3 JsonElement (com.google.gson.JsonElement)2