Search in sources :

Example 16 with TokenizerFactory

Use of org.apache.lucene.analysis.util.TokenizerFactory in the lucene-solr project (Apache).

From the class TestAllAnalyzersHaveFactories, method test().

/**
 * Verifies that every concrete, public analysis component (Tokenizer, TokenFilter,
 * or CharFilter) discovered under {@code org.apache.lucene.analysis} has an SPI
 * factory registered under the conventional name (class name minus its suffix),
 * and that the factory — when it can be built with default args — creates an
 * instance of exactly that component class.
 */
public void test() throws Exception {
    List<Class<?>> analysisClasses = TestRandomChains.getClassesForPackage("org.apache.lucene.analysis");
    for (final Class<?> c : analysisClasses) {
        if (isExcluded(c)) {
            continue;
        }
        // every analysis factory requires a luceneMatchVersion argument
        Map<String, String> args = new HashMap<>();
        args.put("luceneMatchVersion", Version.LATEST.toString());
        if (Tokenizer.class.isAssignableFrom(c)) {
            checkTokenizerFactory(c, args);
        } else if (TokenFilter.class.isAssignableFrom(c)) {
            checkTokenFilterFactory(c, args);
        } else if (CharFilter.class.isAssignableFrom(c)) {
            checkCharFilterFactory(c, args);
        }
    }
}

/**
 * Returns true if {@code c} is not a public, concrete analysis component that is
 * required to have a factory (abstract/synthetic/nested classes, interfaces,
 * known test-only or exempted components, deprecated back-compat hacks, and
 * anything that is not a Tokenizer/TokenFilter/CharFilter).
 */
private boolean isExcluded(Class<?> c) {
    final int modifiers = c.getModifiers();
    // don't waste time with abstract classes
    return Modifier.isAbstract(modifiers)
        || !Modifier.isPublic(modifiers)
        || c.isSynthetic()
        || c.isAnonymousClass()
        || c.isMemberClass()
        || c.isInterface()
        || testComponents.contains(c)
        || crazyComponents.contains(c)
        || oddlyNamedComponents.contains(c)
        || tokenFiltersWithoutFactory.contains(c)
        // deprecated ones are typically back compat hacks
        || c.isAnnotationPresent(Deprecated.class)
        || !(Tokenizer.class.isAssignableFrom(c)
             || TokenFilter.class.isAssignableFrom(c)
             || CharFilter.class.isAssignableFrom(c));
}

/** Asserts a TokenizerFactory is registered for tokenizer class {@code c} and creates it. */
private void checkTokenizerFactory(Class<?> c, Map<String, String> args) throws Exception {
    String clazzName = c.getSimpleName();
    assertTrue(clazzName.endsWith("Tokenizer"));
    // SPI name is the class name minus the "Tokenizer" suffix
    String simpleName = clazzName.substring(0, clazzName.length() - 9);
    assertNotNull(TokenizerFactory.lookupClass(simpleName));
    try {
        TokenizerFactory instance = TokenizerFactory.forName(simpleName, args);
        assertNotNull(instance);
        if (instance instanceof ResourceLoaderAware) {
            ((ResourceLoaderAware) instance).inform(loader);
        }
        assertSame(c, instance.create().getClass());
    } catch (IllegalArgumentException e) {
        // TODO: For now pass because some factories have not yet a default config that always works
    }
}

/** Asserts a TokenFilterFactory is registered for filter class {@code c} and creates it. */
private void checkTokenFilterFactory(Class<?> c, Map<String, String> args) throws Exception {
    String clazzName = c.getSimpleName();
    assertTrue(clazzName.endsWith("Filter"));
    // SPI name drops either the "TokenFilter" or the "Filter" suffix
    String simpleName = clazzName.substring(0, clazzName.length() - (clazzName.endsWith("TokenFilter") ? 11 : 6));
    assertNotNull(TokenFilterFactory.lookupClass(simpleName));
    try {
        TokenFilterFactory instance = TokenFilterFactory.forName(simpleName, args);
        assertNotNull(instance);
        if (instance instanceof ResourceLoaderAware) {
            ((ResourceLoaderAware) instance).inform(loader);
        }
        Class<? extends TokenStream> createdClazz = instance.create(new KeywordTokenizer()).getClass();
        // only check the instance if the factory actually wrapped the input stream
        if (KeywordTokenizer.class != createdClazz) {
            assertSame(c, createdClazz);
        }
    } catch (IllegalArgumentException e) {
        // TODO: For now pass because some factories have not yet a default config that always works
    }
}

/** Asserts a CharFilterFactory is registered for char-filter class {@code c} and creates it. */
private void checkCharFilterFactory(Class<?> c, Map<String, String> args) throws Exception {
    String clazzName = c.getSimpleName();
    assertTrue(clazzName.endsWith("CharFilter"));
    // SPI name is the class name minus the "CharFilter" suffix
    String simpleName = clazzName.substring(0, clazzName.length() - 10);
    assertNotNull(CharFilterFactory.lookupClass(simpleName));
    try {
        CharFilterFactory instance = CharFilterFactory.forName(simpleName, args);
        assertNotNull(instance);
        if (instance instanceof ResourceLoaderAware) {
            ((ResourceLoaderAware) instance).inform(loader);
        }
        Class<? extends Reader> createdClazz = instance.create(new StringReader("")).getClass();
        // only check the instance if the factory actually wrapped the input reader
        if (StringReader.class != createdClazz) {
            assertSame(c, createdClazz);
        }
    } catch (IllegalArgumentException e) {
        // TODO: For now pass because some factories have not yet a default config that always works
    }
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) TokenizerFactory(org.apache.lucene.analysis.util.TokenizerFactory) HashMap(java.util.HashMap) IdentityHashMap(java.util.IdentityHashMap) CharFilterFactory(org.apache.lucene.analysis.util.CharFilterFactory) TokenFilterFactory(org.apache.lucene.analysis.util.TokenFilterFactory) StringReader(java.io.StringReader) ResourceLoaderAware(org.apache.lucene.analysis.util.ResourceLoaderAware) Tokenizer(org.apache.lucene.analysis.Tokenizer) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) ReversePathHierarchyTokenizer(org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer) TeeSinkTokenFilter(org.apache.lucene.analysis.sinks.TeeSinkTokenFilter) CachingTokenFilter(org.apache.lucene.analysis.CachingTokenFilter) MockGraphTokenFilter(org.apache.lucene.analysis.MockGraphTokenFilter) ValidatingTokenFilter(org.apache.lucene.analysis.ValidatingTokenFilter) MockRandomLookaheadTokenFilter(org.apache.lucene.analysis.MockRandomLookaheadTokenFilter) CrankyTokenFilter(org.apache.lucene.analysis.CrankyTokenFilter) MockTokenFilter(org.apache.lucene.analysis.MockTokenFilter) TokenFilter(org.apache.lucene.analysis.TokenFilter) MockHoleInjectingTokenFilter(org.apache.lucene.analysis.MockHoleInjectingTokenFilter)

Example 17 with TokenizerFactory

Use of org.apache.lucene.analysis.util.TokenizerFactory in the lucene-solr project (Apache).

From the class AnalyzerFactoryTask, method createAnalysisPipelineComponent().

/**
   * Instantiates the given analysis factory class after pulling params from
   * the given stream tokenizer, then stores the result in the appropriate
   * pipeline component list.
   *
   * @param stok stream tokenizer from which to draw analysis factory params
   * @param clazz analysis factory class to instantiate
   */
@SuppressWarnings("fallthrough")
private void createAnalysisPipelineComponent(StreamTokenizer stok, Class<? extends AbstractAnalysisFactory> clazz) {
    // name:value params read from the token stream, passed to the factory ctor
    Map<String, String> argMap = new HashMap<>();
    boolean parenthetical = false;
    try {
        WHILE_LOOP: while (stok.nextToken() != StreamTokenizer.TT_EOF) {
            switch(stok.ttype) {
                case ',':
                    {
                        if (parenthetical) {
                            // Do nothing
                            break;
                        } else {
                            // Finished reading this analysis factory configuration
                            break WHILE_LOOP;
                        }
                    }
                case '(':
                    {
                        if (parenthetical) {
                            throw new RuntimeException("Line #" + lineno(stok) + ": Unexpected opening parenthesis.");
                        }
                        parenthetical = true;
                        break;
                    }
                case ')':
                    {
                        if (parenthetical) {
                            parenthetical = false;
                        } else {
                            throw new RuntimeException("Line #" + lineno(stok) + ": Unexpected closing parenthesis.");
                        }
                        break;
                    }
                case StreamTokenizer.TT_WORD:
                    {
                        if (!parenthetical) {
                            throw new RuntimeException("Line #" + lineno(stok) + ": Unexpected token '" + stok.sval + "'");
                        }
                        String argName = stok.sval;
                        stok.nextToken();
                        if (stok.ttype != ':') {
                            throw new RuntimeException("Line #" + lineno(stok) + ": Missing ':' after '" + argName + "' param to " + clazz.getSimpleName());
                        }
                        stok.nextToken();
                        // sval holds the value for word/quoted tokens; null for numbers
                        String argValue = stok.sval;
                        switch(stok.ttype) {
                            case StreamTokenizer.TT_NUMBER:
                                {
                                    argValue = Double.toString(stok.nval);
                                    // Drop the ".0" from numbers, for integer arguments
                                    argValue = TRAILING_DOT_ZERO_PATTERN.matcher(argValue).replaceFirst("");
                                // Intentional fall-through into the put below
                                }
                            case '"':
                            case '\'':
                            case StreamTokenizer.TT_WORD:
                                {
                                    argMap.put(argName, argValue);
                                    break;
                                }
                            case StreamTokenizer.TT_EOF:
                                {
                                    throw new RuntimeException("Unexpected EOF: " + stok.toString());
                                }
                            default:
                                {
                                    throw new RuntimeException("Line #" + lineno(stok) + ": Unexpected token: " + stok.toString());
                                }
                        }
                    }
            }
        }
        // all factories require a luceneMatchVersion; supply a default if absent
        if (!argMap.containsKey("luceneMatchVersion")) {
            argMap.put("luceneMatchVersion", Version.LATEST.toString());
        }
        final AbstractAnalysisFactory instance;
        try {
            instance = clazz.getConstructor(Map.class).newInstance(argMap);
        } catch (Exception e) {
            throw new RuntimeException("Line #" + lineno(stok) + ": ", e);
        }
        if (instance instanceof ResourceLoaderAware) {
            // resolve resources relative to the benchmark work dir, falling back to cwd
            Path baseDir = Paths.get(getRunData().getConfig().get("work.dir", "work"));
            if (!Files.isDirectory(baseDir)) {
                baseDir = Paths.get(".");
            }
            ((ResourceLoaderAware) instance).inform(new FilesystemResourceLoader(baseDir));
        }
        // route the instance into the appropriate pipeline component slot
        if (CharFilterFactory.class.isAssignableFrom(clazz)) {
            charFilterFactories.add((CharFilterFactory) instance);
        } else if (TokenizerFactory.class.isAssignableFrom(clazz)) {
            tokenizerFactory = (TokenizerFactory) instance;
        } else if (TokenFilterFactory.class.isAssignableFrom(clazz)) {
            tokenFilterFactories.add((TokenFilterFactory) instance);
        }
    } catch (RuntimeException e) {
        // getMessage() may be null (e.g. bare RuntimeException) — guard before startsWith
        String message = e.getMessage();
        if (message != null && message.startsWith("Line #")) {
            throw (e);
        } else {
            throw new RuntimeException("Line #" + lineno(stok) + ": ", e);
        }
    } catch (Throwable t) {
        throw new RuntimeException("Line #" + lineno(stok) + ": ", t);
    }
}
Also used : Path(java.nio.file.Path) FilesystemResourceLoader(org.apache.lucene.analysis.util.FilesystemResourceLoader) TokenizerFactory(org.apache.lucene.analysis.util.TokenizerFactory) HashMap(java.util.HashMap) ResourceLoaderAware(org.apache.lucene.analysis.util.ResourceLoaderAware) AbstractAnalysisFactory(org.apache.lucene.analysis.util.AbstractAnalysisFactory)

Example 18 with TokenizerFactory

Use of org.apache.lucene.analysis.util.TokenizerFactory in the tika project (Apache).

From the class AnalyzerDeserializer, method buildAnalyzer().

/**
 * Builds an Analyzer from a JSON description containing an optional char-filter
 * list, a required tokenizer, and an optional token-filter list.
 *
 * @param analyzerName name used in error messages
 * @param value JSON object describing the analyzer components
 * @param maxTokens token limit propagated to the token-filter chain
 * @return the assembled analyzer
 * @throws IOException if a component factory fails to load its resources
 */
public static Analyzer buildAnalyzer(String analyzerName, JsonElement value, int maxTokens) throws IOException {
    if (!value.isJsonObject()) {
        throw new IllegalArgumentException("Expecting map of charfilter, tokenizer, tokenfilters");
    }
    JsonObject root = (JsonObject) value;
    // defaults: no char filters, no token filters; tokenizer is mandatory
    CharFilterFactory[] charFilterFactories = new CharFilterFactory[0];
    TokenFilterFactory[] filterFactories = new TokenFilterFactory[0];
    TokenizerFactory tokenizer = null;
    for (Map.Entry<String, JsonElement> entry : root.entrySet()) {
        String key = entry.getKey();
        JsonElement spec = entry.getValue();
        if (CHAR_FILTERS.equals(key)) {
            charFilterFactories = buildCharFilters(spec, analyzerName);
        } else if (TOKEN_FILTERS.equals(key)) {
            filterFactories = buildTokenFilterFactories(spec, analyzerName, maxTokens);
        } else if (TOKENIZER.equals(key)) {
            tokenizer = buildTokenizerFactory(spec, analyzerName);
        } else if (!COMMENT.equals(key)) {
            // comments are allowed; anything else is a config error
            throw new IllegalArgumentException("Should have one of three values here:" + CHAR_FILTERS + ", " + TOKENIZER + ", " + TOKEN_FILTERS + ". I don't recognize: " + key);
        }
    }
    if (tokenizer == null) {
        throw new IllegalArgumentException("Must specify at least a tokenizer factory for an analyzer!");
    }
    return new MyTokenizerChain(charFilterFactories, tokenizer, filterFactories);
}
Also used : TokenizerFactory(org.apache.lucene.analysis.util.TokenizerFactory) JsonElement(com.google.gson.JsonElement) CharFilterFactory(org.apache.lucene.analysis.util.CharFilterFactory) JsonObject(com.google.gson.JsonObject) HashMap(java.util.HashMap) Map(java.util.Map) TokenFilterFactory(org.apache.lucene.analysis.util.TokenFilterFactory)

Example 19 with TokenizerFactory

Use of org.apache.lucene.analysis.util.TokenizerFactory in the tika project (Apache).

From the class AnalyzerDeserializer, method buildTokenizerFactory().

/**
 * Builds a TokenizerFactory from a JSON object of the form
 * {"factory": &lt;class name&gt;, "params": {...}}. A leading "oala." in the
 * factory name is expanded to "org.apache.lucene.analysis.". The fully
 * qualified class name is mapped back to its SPI name before lookup.
 *
 * @param map JSON element expected to be an object with "factory" and "params"
 * @param analyzerName name used in error messages
 * @return the configured tokenizer factory
 * @throws IOException if a ResourceLoaderAware factory fails to load resources
 */
private static TokenizerFactory buildTokenizerFactory(JsonElement map, String analyzerName) throws IOException {
    if (!(map instanceof JsonObject)) {
        throw new IllegalArgumentException("Expecting a map with \"factory\" string and " + "\"params\" map in tokenizer factory;" + " not: " + map.toString() + " in " + analyzerName);
    }
    JsonElement factoryEl = ((JsonObject) map).get(FACTORY);
    if (factoryEl == null || !factoryEl.isJsonPrimitive()) {
        throw new IllegalArgumentException("Expecting value for factory in char filter factory builder in:" + analyzerName);
    }
    String factoryName = factoryEl.getAsString();
    // "oala." is shorthand for the Lucene analysis package
    factoryName = factoryName.startsWith("oala.") ? factoryName.replaceFirst("oala.", "org.apache.lucene.analysis.") : factoryName;
    JsonElement paramsEl = ((JsonObject) map).get(PARAMS);
    Map<String, String> params = mapify(paramsEl);
    // reverse-map the fully qualified class name to its registered SPI name
    String spiName = "";
    for (String s : TokenizerFactory.availableTokenizers()) {
        Class<?> clazz = TokenizerFactory.lookupClass(s);
        if (clazz.getName().equals(factoryName)) {
            spiName = s;
            break;
        }
    }
    if (spiName.isEmpty()) {
        throw new IllegalArgumentException("A SPI class of type org.apache.lucene.analysis.util.TokenizerFactory with name" + "'" + factoryName + "' does not exist.");
    }
    try {
        TokenizerFactory tokenizerFactory = TokenizerFactory.forName(spiName, params);
        if (tokenizerFactory instanceof ResourceLoaderAware) {
            ((ResourceLoaderAware) tokenizerFactory).inform(new ClasspathResourceLoader(AnalyzerDeserializer.class));
        }
        return tokenizerFactory;
    } catch (IllegalArgumentException e) {
        // add analyzer context to factory construction failures, preserving the cause
        throw new IllegalArgumentException("While working on " + analyzerName, e);
    }
}
Also used : TokenizerFactory(org.apache.lucene.analysis.util.TokenizerFactory) JsonElement(com.google.gson.JsonElement) JsonObject(com.google.gson.JsonObject) ClasspathResourceLoader(org.apache.lucene.analysis.util.ClasspathResourceLoader) ResourceLoaderAware(org.apache.lucene.analysis.util.ResourceLoaderAware)

Aggregations

TokenizerFactory (org.apache.lucene.analysis.util.TokenizerFactory)19 CharFilterFactory (org.apache.lucene.analysis.util.CharFilterFactory)12 TokenFilterFactory (org.apache.lucene.analysis.util.TokenFilterFactory)11 Analyzer (org.apache.lucene.analysis.Analyzer)7 Tokenizer (org.apache.lucene.analysis.Tokenizer)6 ArrayList (java.util.ArrayList)5 HashMap (java.util.HashMap)5 TokenStream (org.apache.lucene.analysis.TokenStream)5 TokenizerChain (org.apache.solr.analysis.TokenizerChain)5 IOException (java.io.IOException)4 StringReader (java.io.StringReader)4 ResourceLoaderAware (org.apache.lucene.analysis.util.ResourceLoaderAware)4 Map (java.util.Map)3 WhitespaceTokenizer (org.apache.lucene.analysis.core.WhitespaceTokenizer)3 AbstractAnalysisFactory (org.apache.lucene.analysis.util.AbstractAnalysisFactory)3 SolrException (org.apache.solr.common.SolrException)3 JsonElement (com.google.gson.JsonElement)2 JsonObject (com.google.gson.JsonObject)2 Reader (java.io.Reader)2 ParseException (java.text.ParseException)2