Use of org.apache.lucene.analysis.util.CharArraySet in project languagetool by languagetool-org.
In class SentenceSourceIndexer, method main.
/**
 * Command-line entry point: indexes sentences from Wikipedia/Tatoeba dumps into a Lucene index.
 * Expects exactly five arguments (data files, index dir, language code, sentence limit, POS flag);
 * prints usage and exits with status 1 otherwise.
 *
 * @param args dataFiles, indexDir, languageCode, maxSentences, indexPosTags — see usage text
 * @throws Exception on any indexing or I/O failure
 */
public static void main(String... args) throws Exception {
    if (args.length != 5) {
        // NOTE: argument name matches the detail line below ("<dataFiles>").
        System.out.println("Usage: " + SentenceSourceIndexer.class.getSimpleName() + " <dataFiles> <indexDir> <languageCode> <maxSentences> <indexPosTags>");
        System.out.println("\t<dataFiles> comma-separated list of a Wikipedia XML dump (*.xml) and/or Tatoeba files (tatoeba-*)");
        System.out.println("\t<indexDir> directory where Lucene index will be written to, existing index content will be removed");
        System.out.println("\t<languageCode> short code like en for English, de for German etc");
        System.out.println("\t<maxSentences> maximum number of sentences to be indexed, use 0 for no limit");
        System.out.println("\t<indexPosTags> 1 to also index POS tags (i.e. analyze text by LT), 0 to index only the plain text");
        System.exit(1);
    }
    List<String> dumpFilesNames = Arrays.asList(args[0].split(","));
    File indexDir = new File(args[1]);
    String languageCode = args[2];
    int maxSentences = Integer.parseInt(args[3]);
    Language language = Languages.getLanguageForShortCode(languageCode);
    if (maxSentences == 0) {
        System.out.println("Going to index contents from " + dumpFilesNames);
    } else {
        System.out.println("Going to index up to " + maxSentences + " sentences from " + dumpFilesNames);
    }
    System.out.println("Output index dir: " + indexDir);
    long start = System.currentTimeMillis();
    Analyzer analyzer;
    String indexPos = args[4];
    if (indexPos.equals("1")) {
        // null signals the indexer to use its LanguageToolAnalyzer (POS-aware analysis).
        analyzer = null;
    } else if (indexPos.equals("0")) {
        // Plain-text indexing: StandardAnalyzer with an empty stop-word set.
        analyzer = new StandardAnalyzer(new CharArraySet(Collections.emptyList(), false));
    } else {
        throw new IllegalArgumentException("Unknown value '" + indexPos + "' for indexPosTags parameter, use 0 or 1");
    }
    try {
        try (FSDirectory fsDirectory = FSDirectory.open(indexDir.toPath());
             SentenceSourceIndexer indexer = new SentenceSourceIndexer(fsDirectory, language, maxSentences, analyzer)) {
            try {
                indexer.run(dumpFilesNames, language);
            } catch (DocumentLimitReachedException e) {
                // Hitting the configured sentence limit is expected, not an error.
                System.out.println("Sentence limit (" + e.getLimit() + ") reached, stopping indexing");
            } finally {
                indexer.writeMetaDocuments();
            }
        }
    } finally {
        // Fixed resource leak: previously the analyzer was only closed on the
        // success path, so an exception from indexer.run() would leak it.
        if (analyzer != null) {
            analyzer.close();
        }
    }
    long end = System.currentTimeMillis();
    float minutes = (end - start) / (float) (1000 * 60);
    System.out.printf("Indexing took %.2f minutes\n", minutes);
}
Use of org.apache.lucene.analysis.util.CharArraySet in project jackrabbit-oak by apache.
In class NodeStateAnalyzerFactory, method createAnalyzerViaReflection.
/**
 * Instantiates an Analyzer subclass named in the node state, via reflection.
 * If the class is a StopwordAnalyzerBase and the node has a stopwords child,
 * the (Version, CharArraySet) constructor is used; otherwise the (Version) one.
 *
 * @param state node state carrying the analyzer class name and optional stopwords
 * @return the constructed Analyzer
 * @throws RuntimeException if the stopwords cannot be read or the analyzer
 *         cannot be reflectively constructed
 */
private Analyzer createAnalyzerViaReflection(NodeState state) {
    String clazz = state.getString(LuceneIndexConstants.ANL_CLASS);
    Class<? extends Analyzer> analyzerClazz = defaultLoader.findClass(clazz, Analyzer.class);
    Version matchVersion = getVersion(state);
    CharArraySet stopwords = null;
    if (StopwordAnalyzerBase.class.isAssignableFrom(analyzerClazz) && state.hasChildNode(LuceneIndexConstants.ANL_STOPWORDS)) {
        try {
            stopwords = loadStopwordSet(state.getChildNode(LuceneIndexConstants.ANL_STOPWORDS), LuceneIndexConstants.ANL_STOPWORDS, matchVersion);
        } catch (IOException e) {
            throw new RuntimeException("Error occurred while loading stopwords", e);
        }
    }
    try {
        if (stopwords != null) {
            // Stopword-aware analyzers take (Version, CharArraySet).
            Constructor<? extends Analyzer> c = analyzerClazz.getConstructor(Version.class, CharArraySet.class);
            return c.newInstance(matchVersion, stopwords);
        } else {
            Constructor<? extends Analyzer> c = analyzerClazz.getConstructor(Version.class);
            return c.newInstance(matchVersion);
        }
    } catch (NoSuchMethodException | InstantiationException | IllegalAccessException | InvocationTargetException e) {
        // Multi-catch: all four reflective failures were handled identically.
        throw new RuntimeException("Error occurred while instantiating Analyzer for " + analyzerClazz, e);
    }
}
Use of org.apache.lucene.analysis.util.CharArraySet in project Vidyavana by borsosl.
In class HtmlAnalyzer, method createComponents.
/**
 * Builds the analysis chain for a field: transliteration tokenizer,
 * Hungarian-article stop filter, then transliteration synonym expansion.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    // Hungarian articles/conjunction to drop; case-sensitive matching (ignoreCase=false).
    CharArraySet articles = new CharArraySet(Arrays.asList("a", "az", "és"), false);
    Tokenizer source = new TransliterationTokenizer();
    TokenStream chain = new TransliterationSynonymFilter(new StopFilter(source, articles));
    return new TokenStreamComponents(source, chain);
}
Use of org.apache.lucene.analysis.util.CharArraySet in project vertigo by KleeGroup.
In class DefaultAnalyzer, method createComponents.
/**
 * Creates a TokenStream which tokenizes all the text in the provided Reader.
 *
 * @return A TokenStream built from a StandardTokenizer filtered with
 * ElisionFilter, StopFilter, ASCIIFoldingFilter and LowerCaseFilter
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    // Tokenize on standard word boundaries.
    final Tokenizer tokenizer = new StandardTokenizer();
    // Strip French elisions (l', d', ...); ignoreCase=true.
    final CharArraySet elisions = new CharArraySet(Arrays.asList(LuceneConstants.ELISION_ARTICLES), true);
    TokenStream chain = new ElisionFilter(tokenizer, elisions);
    // Remove stop words (articles, adjectives).
    chain = new StopFilter(chain, stopWords);
    // Fold accented characters to ASCII equivalents.
    chain = new ASCIIFoldingFilter(chain);
    // Normalize to lower case.
    chain = new LowerCaseFilter(chain);
    return new TokenStreamComponents(tokenizer, chain);
}
Use of org.apache.lucene.analysis.util.CharArraySet in project orientdb by orientechnologies.
In class OLuceneAnalyzerFactory, method buildAnalyzer.
/**
 * Reflectively builds an Analyzer from its fully-qualified class name,
 * passing the given stopwords to its (CharArraySet) constructor.
 * Falls back to a plain StandardAnalyzer if instantiation fails for any
 * reason other than a missing class or missing constructor (which throw).
 *
 * @param analyzerFQN fully-qualified analyzer class name
 * @param stopwords   stop words handed to the analyzer (matched case-insensitively)
 * @return the constructed Analyzer, or a default StandardAnalyzer on unexpected failure
 */
private Analyzer buildAnalyzer(String analyzerFQN, Collection<String> stopwords) {
    try {
        // Use wildcard generics instead of raw Class/Constructor types.
        final Class<?> analyzerClass = Class.forName(analyzerFQN);
        final Constructor<?> constructor = analyzerClass.getDeclaredConstructor(CharArraySet.class);
        return (Analyzer) constructor.newInstance(new CharArraySet(stopwords, true));
    } catch (ClassNotFoundException e) {
        throw OException.wrapException(new OIndexException("Analyzer: " + analyzerFQN + " not found"), e);
    } catch (NoSuchMethodException e) {
        throw OException.wrapException(new OIndexException("Couldn't instantiate analyzer: public constructor not found"), e);
    } catch (Exception e) {
        // Deliberate best-effort: log and fall through to the default analyzer.
        OLogManager.instance().error(this, "Error on getting analyzer for Lucene index", e);
    }
    return new StandardAnalyzer();
}
Aggregations