
Example 1 with CharArraySet

Use of org.apache.lucene.analysis.util.CharArraySet in project languagetool by languagetool-org.

The class SentenceSourceIndexer, method main:

public static void main(String... args) throws Exception {
    if (args.length != 5) {
        System.out.println("Usage: " + SentenceSourceIndexer.class.getSimpleName() + " <dataFile...> <indexDir> <languageCode> <maxSentences> <indexPosTags>");
        System.out.println("\t<dataFiles> comma-separated list of a Wikipedia XML dump (*.xml) and/or Tatoeba files (tatoeba-*)");
        System.out.println("\t<indexDir> directory where Lucene index will be written to, existing index content will be removed");
        System.out.println("\t<languageCode> short code like en for English, de for German etc");
        System.out.println("\t<maxSentences> maximum number of sentences to be indexed, use 0 for no limit");
        System.out.println("\t<indexPosTags> 1 to also index POS tags (i.e. analyze text by LT), 0 to index only the plain text");
        System.exit(1);
    }
    List<String> dumpFilesNames = Arrays.asList(args[0].split(","));
    File indexDir = new File(args[1]);
    String languageCode = args[2];
    int maxSentences = Integer.parseInt(args[3]);
    Language language = Languages.getLanguageForShortCode(languageCode);
    if (maxSentences == 0) {
        System.out.println("Going to index contents from " + dumpFilesNames);
    } else {
        System.out.println("Going to index up to " + maxSentences + " sentences from " + dumpFilesNames);
    }
    System.out.println("Output index dir: " + indexDir);
    long start = System.currentTimeMillis();
    Analyzer analyzer;
    String indexPos = args[4];
    if (indexPos.equals("1")) {
        // this will use LanguageToolAnalyzer
        analyzer = null;
    } else if (indexPos.equals("0")) {
        analyzer = new StandardAnalyzer(new CharArraySet(Collections.emptyList(), false));
    } else {
        throw new IllegalArgumentException("Unknown value '" + indexPos + "' for indexPosTags parameter, use 0 or 1");
    }
    try (FSDirectory fsDirectory = FSDirectory.open(indexDir.toPath());
        SentenceSourceIndexer indexer = new SentenceSourceIndexer(fsDirectory, language, maxSentences, analyzer)) {
        try {
            indexer.run(dumpFilesNames, language);
        } catch (DocumentLimitReachedException e) {
            System.out.println("Sentence limit (" + e.getLimit() + ") reached, stopping indexing");
        } finally {
            indexer.writeMetaDocuments();
        }
        if (analyzer != null) {
            analyzer.close();
        }
    }
    long end = System.currentTimeMillis();
    float minutes = (end - start) / (float) (1000 * 60);
    System.out.printf("Indexing took %.2f minutes\n", minutes);
}
Also used: CharArraySet (org.apache.lucene.analysis.util.CharArraySet), FSDirectory (org.apache.lucene.store.FSDirectory), Analyzer (org.apache.lucene.analysis.Analyzer), StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer), Language (org.languagetool.Language), File (java.io.File)
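
The second constructor argument of CharArraySet controls case sensitivity. A minimal standalone sketch of that flag (not taken from any of the projects above; it assumes a pre-7.x lucene-analyzers-common dependency, where CharArraySet still lives in org.apache.lucene.analysis.util):

import java.util.Arrays;
import org.apache.lucene.analysis.util.CharArraySet;

public class CharArraySetFlagDemo {

    public static void main(String[] args) {
        // ignoreCase = true: lookups match regardless of case
        CharArraySet caseInsensitive = new CharArraySet(Arrays.asList("the", "and"), true);
        System.out.println(caseInsensitive.contains("The"));  // true
        System.out.println(caseInsensitive.contains("tree")); // false

        // ignoreCase = false (as in Example 1): exact-case matching only
        CharArraySet caseSensitive = new CharArraySet(Arrays.asList("The"), false);
        System.out.println(caseSensitive.contains("the"));    // false
        System.out.println(caseSensitive.contains("The"));    // true
    }
}

Example 1 passes an empty list with ignoreCase = false, which simply gives a StandardAnalyzer with no stopwords at all.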

Example 2 with CharArraySet

Use of org.apache.lucene.analysis.util.CharArraySet in project jackrabbit-oak by apache.

The class NodeStateAnalyzerFactory, method createAnalyzerViaReflection:

private Analyzer createAnalyzerViaReflection(NodeState state) {
    String clazz = state.getString(LuceneIndexConstants.ANL_CLASS);
    Class<? extends Analyzer> analyzerClazz = defaultLoader.findClass(clazz, Analyzer.class);
    Version matchVersion = getVersion(state);
    CharArraySet stopwords = null;
    if (StopwordAnalyzerBase.class.isAssignableFrom(analyzerClazz) && state.hasChildNode(LuceneIndexConstants.ANL_STOPWORDS)) {
        try {
            stopwords = loadStopwordSet(state.getChildNode(LuceneIndexConstants.ANL_STOPWORDS), LuceneIndexConstants.ANL_STOPWORDS, matchVersion);
        } catch (IOException e) {
            throw new RuntimeException("Error occurred while loading stopwords", e);
        }
    }
    try {
        final Constructor<? extends Analyzer> c;
        if (stopwords != null) {
            c = analyzerClazz.getConstructor(Version.class, CharArraySet.class);
            return c.newInstance(matchVersion, stopwords);
        } else {
            c = analyzerClazz.getConstructor(Version.class);
            return c.newInstance(matchVersion);
        }
    } catch (NoSuchMethodException | InstantiationException | IllegalAccessException
            | InvocationTargetException e) {
        throw new RuntimeException("Error occurred while instantiating Analyzer for " + analyzerClazz, e);
    }
}
Also used: CharArraySet (org.apache.lucene.analysis.util.CharArraySet), Version (org.apache.lucene.util.Version), StopwordAnalyzerBase (org.apache.lucene.analysis.util.StopwordAnalyzerBase), IOException (java.io.IOException), InvocationTargetException (java.lang.reflect.InvocationTargetException)

Example 3 with CharArraySet

Use of org.apache.lucene.analysis.util.CharArraySet in project Vidyavana by borsosl.

The class HtmlAnalyzer, method createComponents:

@Override
protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer tokenizer = new TransliterationTokenizer();
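    // Hungarian articles "a", "az", "és" as stopwords; ignoreCase = false keeps the matching case-sensitive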
    TokenStream filter = new StopFilter(tokenizer, new CharArraySet(Arrays.asList("a", "az", "és"), false));
    filter = new TransliterationSynonymFilter(filter);
    return new TokenStreamComponents(tokenizer, filter);
}
Also used: CharArraySet (org.apache.lucene.analysis.util.CharArraySet), TokenStream (org.apache.lucene.analysis.TokenStream), StopFilter (org.apache.lucene.analysis.core.StopFilter), Tokenizer (org.apache.lucene.analysis.Tokenizer)
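
To see what such a stopword chain produces, here is a hedged, project-independent sketch that swaps the Vidyavana-specific TransliterationTokenizer chain for a plain StandardAnalyzer with the same Hungarian stopword set and dumps the surviving tokens via CharTermAttribute:

import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;

public class StopwordTokenDump {

    public static void main(String[] args) throws IOException {
        CharArraySet stopwords = new CharArraySet(Arrays.asList("a", "az", "és"), false);
        try (Analyzer analyzer = new StandardAnalyzer(stopwords);
             TokenStream ts = analyzer.tokenStream("body", "a ház és az erdő")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // prints "ház" and "erdő"; the stopwords are filtered out
                System.out.println(term.toString());
            }
            ts.end();
        }
    }
}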

Example 4 with CharArraySet

Use of org.apache.lucene.analysis.util.CharArraySet in project vertigo by KleeGroup.

The class DefaultAnalyzer, method createComponents:

/**
 * Creates a TokenStream which tokenizes all the text in the provided Reader.
 *
 * @return A TokenStream built from a StandardTokenizer filtered with
 *         ElisionFilter, StopFilter, ASCIIFoldingFilter and LowerCaseFilter
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    /* tokenizer initialization */
    final Tokenizer source = new StandardTokenizer();
    // -----
    /* remove elisions (l', d', ...) */
    final CharArraySet elisionSet = new CharArraySet(Arrays.asList(LuceneConstants.ELISION_ARTICLES), true);
    TokenStream filter = new ElisionFilter(source, elisionSet);
    /* remove stop words (articles, adjectives) */
    filter = new StopFilter(filter, stopWords);
    /* remove accents */
    filter = new ASCIIFoldingFilter(filter);
    /* lowercase */
    filter = new LowerCaseFilter(filter);
    return new TokenStreamComponents(source, filter);
}
Also used: CharArraySet (org.apache.lucene.analysis.util.CharArraySet), TokenStream (org.apache.lucene.analysis.TokenStream), ElisionFilter (org.apache.lucene.analysis.util.ElisionFilter), StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer), StopFilter (org.apache.lucene.analysis.core.StopFilter), ASCIIFoldingFilter (org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter), Tokenizer (org.apache.lucene.analysis.Tokenizer), LowerCaseFilter (org.apache.lucene.analysis.core.LowerCaseFilter)
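
The same elision handling can be reproduced without the vertigo-specific LuceneConstants.ELISION_ARTICLES. The sketch below is an illustration added here (not project code) that uses Lucene's own FrenchAnalyzer.DEFAULT_ARTICLES, itself a ready-made case-insensitive CharArraySet, and shows the filter stripping the elided article from l'avion:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.ElisionFilter;

public class ElisionDemo {

    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer source = new StandardTokenizer();
                // FrenchAnalyzer.DEFAULT_ARTICLES is a CharArraySet of French elided articles
                TokenStream filter = new ElisionFilter(source, FrenchAnalyzer.DEFAULT_ARTICLES);
                filter = new LowerCaseFilter(filter);
                return new TokenStreamComponents(source, filter);
            }
        };
        try (TokenStream ts = analyzer.tokenStream("f", "L'avion de l'aéroport")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // prints "avion", "de", "aéroport": the elided articles are stripped
                System.out.println(term.toString());
            }
            ts.end();
        }
        analyzer.close();
    }
}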

Example 5 with CharArraySet

Use of org.apache.lucene.analysis.util.CharArraySet in project orientdb by orientechnologies.

The class OLuceneAnalyzerFactory, method buildAnalyzer:

private Analyzer buildAnalyzer(String analyzerFQN, Collection<String> stopwords) {
    try {
        final Class<?> classAnalyzer = Class.forName(analyzerFQN);
        final Constructor<?> constructor = classAnalyzer.getDeclaredConstructor(CharArraySet.class);
        return (Analyzer) constructor.newInstance(new CharArraySet(stopwords, true));
    } catch (ClassNotFoundException e) {
        throw OException.wrapException(new OIndexException("Analyzer: " + analyzerFQN + " not found"), e);
    } catch (NoSuchMethodException e) {
        throw OException.wrapException(new OIndexException("Couldn't instantiate analyzer:  public constructor  not found"), e);
    } catch (Exception e) {
        OLogManager.instance().error(this, "Error on getting analyzer for Lucene index", e);
    }
    return new StandardAnalyzer();
}
Also used: CharArraySet (org.apache.lucene.analysis.util.CharArraySet), OIndexException (com.orientechnologies.orient.core.index.OIndexException), Constructor (java.lang.reflect.Constructor), StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer), Analyzer (org.apache.lucene.analysis.Analyzer), OException (com.orientechnologies.common.exception.OException)
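
Outside OrientDB, the same reflective pattern can be exercised directly. A minimal sketch, assuming a Lucene 5.x/6.x lucene-analyzers-common on the classpath and using EnglishAnalyzer only as an example of an analyzer that exposes a public (CharArraySet) constructor:

import java.lang.reflect.Constructor;
import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.util.CharArraySet;

public class ReflectiveAnalyzerDemo {

    public static void main(String[] args) throws Exception {
        // hypothetical inputs; any analyzer with a public (CharArraySet) constructor works the same way
        String analyzerFQN = "org.apache.lucene.analysis.en.EnglishAnalyzer";
        CharArraySet stopwords = new CharArraySet(Arrays.asList("the", "and", "of"), true);

        Class<?> clazz = Class.forName(analyzerFQN);
        Constructor<?> ctor = clazz.getDeclaredConstructor(CharArraySet.class);
        try (Analyzer analyzer = (Analyzer) ctor.newInstance(stopwords)) {
            System.out.println("Built " + analyzer.getClass().getSimpleName()
                    + " with " + stopwords.size() + " stopwords");
        }
    }
}

As in buildAnalyzer above, the CharArraySet is created with ignoreCase = true, so stopword matching is case-insensitive.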

Aggregations

CharArraySet (org.apache.lucene.analysis.util.CharArraySet): 12
StringReader (java.io.StringReader): 6
TokenStream (org.apache.lucene.analysis.TokenStream): 5
Analyzer (org.apache.lucene.analysis.Analyzer): 4
StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer): 4
IOException (java.io.IOException): 3
Tokenizer (org.apache.lucene.analysis.Tokenizer): 3
StopFilter (org.apache.lucene.analysis.core.StopFilter): 3
CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 2
OException (com.orientechnologies.common.exception.OException): 1
OIndexException (com.orientechnologies.orient.core.index.OIndexException): 1
DataflowException (edu.uci.ics.texera.api.exception.DataflowException): 1
TexeraException (edu.uci.ics.texera.api.exception.TexeraException): 1
DataFlowException (edu.uci.ics.textdb.api.exception.DataFlowException): 1
File (java.io.File): 1
Constructor (java.lang.reflect.Constructor): 1
InvocationTargetException (java.lang.reflect.InvocationTargetException): 1
ArrayList (java.util.ArrayList): 1
ArabicAnalyzer (org.apache.lucene.analysis.ar.ArabicAnalyzer): 1
LowerCaseFilter (org.apache.lucene.analysis.core.LowerCaseFilter): 1