Search in sources :

Example 36 with Analyzer

use of org.apache.lucene.analysis.Analyzer in project languagetool by languagetool-org.

the class PatternRuleQueryBuilderTest method setUp.

/**
 * Prepares the fixture: writes two sample sentences into a fresh in-memory
 * index and opens a reader and searcher on top of it.
 */
@Override
public void setUp() throws Exception {
    super.setUp();
    language = new English();
    // The index is held in a RAMDirectory, so each run starts from an empty index.
    directory = new RAMDirectory();
    Analyzer languageAnalyzer = Indexer.getAnalyzer(language);
    IndexWriterConfig writerConfig = Indexer.getIndexWriterConfig(languageAnalyzer);
    // Sample sentences are kept verbatim; they are the data the tests query against.
    String[] sampleSentences = {
        "How do you thin about this wonderful idea?",
        "The are several grammar checkers for English, E.G. LanguageTool 123."
    };
    try (IndexWriter indexWriter = new IndexWriter(directory, writerConfig)) {
        for (String sentence : sampleSentences) {
            addDocument(indexWriter, sentence);
        }
    }
    // The writer is closed before the reader is opened on the same directory.
    reader = DirectoryReader.open(directory);
    searcher = newSearcher(reader);
}
Also used : English(org.languagetool.language.English) IndexWriter(org.apache.lucene.index.IndexWriter) Analyzer(org.apache.lucene.analysis.Analyzer) RAMDirectory(org.apache.lucene.store.RAMDirectory) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Example 37 with Analyzer

use of org.apache.lucene.analysis.Analyzer in project languagetool by languagetool-org.

the class SentenceSourceIndexer method main.

/**
 * Command-line entry point that indexes sentences from the given data files
 * into a Lucene index directory.
 *
 * <p>Exactly five arguments are expected, in this order: a comma-separated list
 * of data files, the index directory, the language short code, the maximum
 * number of sentences (0 for no limit) and the POS-tag switch ("0" or "1").
 * With any other argument count the usage text is printed and the JVM exits
 * with status 1.
 *
 * @param args the five command-line arguments described above
 * @throws NumberFormatException if the maximum sentence count is not an integer
 * @throws IllegalArgumentException if the POS-tag switch is neither "0" nor "1"
 * @throws Exception if opening the index directory or indexing fails
 */
public static void main(String... args) throws Exception {
    if (args.length != 5) {
        System.out.println("Usage: " + SentenceSourceIndexer.class.getSimpleName() + " <dataFile...> <indexDir> <languageCode> <maxSentences> <indexPosTags>");
        System.out.println("\t<dataFiles> comma-separated list of a Wikipedia XML dump (*.xml) and/or Tatoeba files (tatoeba-*)");
        System.out.println("\t<indexDir> directory where Lucene index will be written to, existing index content will be removed");
        System.out.println("\t<languageCode> short code like en for English, de for German etc");
        System.out.println("\t<maxSentences> maximum number of sentences to be indexed, use 0 for no limit");
        System.out.println("\t<indexPosTags> 1 to also index POS tags (i.e. analyze text by LT), 0 to index only the plain text");
        System.exit(1);
    }
    List<String> dumpFilesNames = Arrays.asList(args[0].split(","));
    File indexDir = new File(args[1]);
    String languageCode = args[2];
    int maxSentences = Integer.parseInt(args[3]);
    Language language = Languages.getLanguageForShortCode(languageCode);
    if (maxSentences == 0) {
        System.out.println("Going to index contents from " + dumpFilesNames);
    } else {
        System.out.println("Going to index up to " + maxSentences + " sentences from " + dumpFilesNames);
    }
    System.out.println("Output index dir: " + indexDir);
    long start = System.currentTimeMillis();
    Analyzer analyzer;
    String indexPos = args[4];
    if (indexPos.equals("1")) {
        // this will use LanguageToolAnalyzer
        analyzer = null;
    } else if (indexPos.equals("0")) {
        analyzer = new StandardAnalyzer(new CharArraySet(Collections.emptyList(), false));
    } else {
        throw new IllegalArgumentException("Unknown value '" + indexPos + "' for indexPosTags parameter, use 0 or 1");
    }
    // The analyzer is managed by try-with-resources so it is closed on every exit
    // path, including failures while opening the directory or while indexing.
    // A null resource is skipped by try-with-resources. Resources are closed in
    // reverse order, so the analyzer is closed after the indexer that uses it.
    try (Analyzer managedAnalyzer = analyzer;
        FSDirectory fsDirectory = FSDirectory.open(indexDir.toPath());
        SentenceSourceIndexer indexer = new SentenceSourceIndexer(fsDirectory, language, maxSentences, managedAnalyzer)) {
        try {
            indexer.run(dumpFilesNames, language);
        } catch (DocumentLimitReachedException e) {
            // Reaching the sentence limit is the expected way to stop early.
            System.out.println("Sentence limit (" + e.getLimit() + ") reached, stopping indexing");
        } finally {
            // Meta documents are written even when indexing stopped early or failed.
            indexer.writeMetaDocuments();
        }
    }
    long end = System.currentTimeMillis();
    float minutes = (end - start) / (float) (1000 * 60);
    System.out.printf("Indexing took %.2f minutes\n", minutes);
}
Also used : CharArraySet(org.apache.lucene.analysis.util.CharArraySet) FSDirectory(org.apache.lucene.store.FSDirectory) Analyzer(org.apache.lucene.analysis.Analyzer) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) Language(org.languagetool.Language) File(java.io.File)

Example 38 with Analyzer

use of org.apache.lucene.analysis.Analyzer in project neo4j-mobile-android by neo4j-contrib.

the class IndexType method getIndexType.

/**
 * Resolves the {@link IndexType} described by an index configuration.
 *
 * <p>When a {@code type} entry is present it selects a built-in alternative:
 * "exact" yields {@code EXACT}; "fulltext" yields a {@code CustomType} using the
 * configured custom analyzer or, if none is configured, one of the whitespace
 * analyzers chosen by the to-lower-case setting. When no {@code type} is present
 * a custom analyzer must be configured and is used directly.
 *
 * @param identifier the index identifier (not read by this method)
 * @param config the index configuration map
 * @return the resolved index type, never {@code null}
 * @throws IllegalArgumentException if {@code type} is present but not recognized,
 *         or if neither a {@code type} nor a custom analyzer is configured
 */
static IndexType getIndexType(IndexIdentifier identifier, Map<String, String> config) {
    String type = config.get(LuceneIndexImplementation.KEY_TYPE);
    IndexType result = null;
    Similarity similarity = getCustomSimilarity(config);
    boolean toLowerCase = parseBoolean(config.get(LuceneIndexImplementation.KEY_TO_LOWER_CASE), true);
    Analyzer customAnalyzer = getCustomAnalyzer(config);
    if (type != null) {
        // Use the built in alternatives... "exact" or "fulltext"
        if (type.equals("exact")) {
            result = EXACT;
        } else if (type.equals("fulltext")) {
            Analyzer analyzer = customAnalyzer;
            if (analyzer == null) {
                analyzer = toLowerCase ? LuceneDataSource.LOWER_CASE_WHITESPACE_ANALYZER : LuceneDataSource.WHITESPACE_ANALYZER;
            }
            result = new CustomType(analyzer, toLowerCase, similarity);
        } else {
            // Fail fast on an unknown type instead of handing a null IndexType to the caller.
            throw new IllegalArgumentException("The given type was not recognized: '" + type + "'. Known types are 'exact' and 'fulltext'");
        }
    } else {
        // Use custom analyzer
        if (customAnalyzer == null) {
            throw new IllegalArgumentException("No 'type' was given (which can point out " + "built-in analyzers, such as 'exact' and 'fulltext')" + " and no 'analyzer' was given either (which can point out a custom " + Analyzer.class.getName() + " to use)");
        }
        result = new CustomType(customAnalyzer, toLowerCase, similarity);
    }
    return result;
}
Also used : Similarity(org.apache.lucene.search.Similarity) Analyzer(org.apache.lucene.analysis.Analyzer)

Example 39 with Analyzer

use of org.apache.lucene.analysis.Analyzer in project orientdb by orientechnologies.

the class OLuceneAnalyzerFactory method buildAnalyzer.

/**
 * Reflectively creates the analyzer named by {@code analyzerFQN}, using its
 * constructor that takes a single {@link CharArraySet} built from the stopwords.
 *
 * <p>A missing class or missing constructor is reported as an
 * {@link OIndexException}. Any other failure (for example during instantiation)
 * is logged, and a default {@link StandardAnalyzer} is returned instead.
 *
 * @param analyzerFQN fully qualified class name of the analyzer to instantiate
 * @param stopwords stopwords passed to the analyzer's constructor
 * @return the instantiated analyzer, or a default {@link StandardAnalyzer} as fallback
 */
private Analyzer buildAnalyzer(String analyzerFQN, Collection<String> stopwords) {
    try {
        // Wildcard types instead of raw Class/Constructor keep the reflection code free of raw-type warnings.
        final Class<?> classAnalyzer = Class.forName(analyzerFQN);
        final Constructor<?> constructor = classAnalyzer.getDeclaredConstructor(CharArraySet.class);
        return (Analyzer) constructor.newInstance(new CharArraySet(stopwords, true));
    } catch (ClassNotFoundException e) {
        throw OException.wrapException(new OIndexException("Analyzer: " + analyzerFQN + " not found"), e);
    } catch (NoSuchMethodException e) {
        throw OException.wrapException(new OIndexException("Couldn't instantiate analyzer:  public constructor  not found"), e);
    } catch (Exception e) {
        // Deliberate best effort: log the problem and fall through to the default analyzer below.
        OLogManager.instance().error(this, "Error on getting analyzer for Lucene index", e);
    }
    return new StandardAnalyzer();
}
Also used : CharArraySet(org.apache.lucene.analysis.util.CharArraySet) OIndexException(com.orientechnologies.orient.core.index.OIndexException) Constructor(java.lang.reflect.Constructor) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer) OException(com.orientechnologies.common.exception.OException)

Example 40 with Analyzer

use of org.apache.lucene.analysis.Analyzer in project elasticsearch-suggest-plugin by spinscale.

the class AbstractCacheLoaderSuggester method load.

/**
 * Builds the suggester for the given field, resolving the query and index
 * analyzers it should use.
 *
 * <p>For each of the two analyzers the resolution order is: the analyzer named
 * explicitly in {@code fieldType} (if any), otherwise the analyzer taken from the
 * field's mapper (if the field is mapped), otherwise a {@link StandardAnalyzer}.
 *
 * @param fieldType the field, types and optional analyzer names to load a suggester for
 * @return the suggester created by {@code getSuggester}
 * @throws ElasticsearchException if an explicitly named analyzer does not exist
 * @throws Exception if creating the suggester fails
 */
@Override
public T load(ShardSuggestService.FieldType fieldType) throws Exception {
    MapperService.SmartNameFieldMappers fieldMappers = mapperService.smartName(fieldType.field(), fieldType.types());
    Analyzer queryAnalyzer = null;
    Analyzer indexAnalyzer = null;
    if (fieldMappers != null) {
        // Reuse the lookup result instead of resolving the same field a second time.
        FieldMapper fieldMapper = fieldMappers.mapper();
        queryAnalyzer = fieldMapper.searchAnalyzer();
        // The override must be guarded by the query analyzer name itself; guarding it
        // by the index analyzer name looked up a null analyzer name whenever only an
        // index analyzer was requested.
        if (Strings.hasLength(fieldType.queryAnalyzer())) {
            NamedAnalyzer namedAnalyzer = analysisService.analyzer(fieldType.queryAnalyzer());
            if (namedAnalyzer == null) {
                throw new ElasticsearchException("Query analyzer[" + fieldType.queryAnalyzer() + "] does not exist.");
            }
            queryAnalyzer = namedAnalyzer.analyzer();
        }
        // The index-side default comes from the mapper's index analyzer, not its search analyzer.
        indexAnalyzer = fieldMapper.indexAnalyzer();
        if (Strings.hasLength(fieldType.indexAnalyzer())) {
            NamedAnalyzer namedAnalyzer = analysisService.analyzer(fieldType.indexAnalyzer());
            if (namedAnalyzer == null) {
                throw new ElasticsearchException("Index analyzer[" + fieldType.indexAnalyzer() + "] does not exist.");
            }
            indexAnalyzer = namedAnalyzer.analyzer();
        }
    }
    // Fall back to a standard analyzer when nothing else supplied one.
    if (queryAnalyzer == null) {
        queryAnalyzer = new StandardAnalyzer(org.elasticsearch.Version.CURRENT.luceneVersion);
    }
    if (indexAnalyzer == null) {
        indexAnalyzer = new StandardAnalyzer(org.elasticsearch.Version.CURRENT.luceneVersion);
    }
    return getSuggester(indexAnalyzer, queryAnalyzer, fieldType);
}
Also used : NamedAnalyzer(org.elasticsearch.index.analysis.NamedAnalyzer) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) ElasticsearchException(org.elasticsearch.ElasticsearchException) Analyzer(org.apache.lucene.analysis.Analyzer) FieldMapper(org.elasticsearch.index.mapper.FieldMapper) MapperService(org.elasticsearch.index.mapper.MapperService)

Aggregations

Analyzer (org.apache.lucene.analysis.Analyzer)1020 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)396 Tokenizer (org.apache.lucene.analysis.Tokenizer)265 MockTokenizer (org.apache.lucene.analysis.MockTokenizer)228 Document (org.apache.lucene.document.Document)207 Directory (org.apache.lucene.store.Directory)192 KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer)176 BytesRef (org.apache.lucene.util.BytesRef)122 Test (org.junit.Test)119 TokenStream (org.apache.lucene.analysis.TokenStream)107 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)92 Term (org.apache.lucene.index.Term)92 IndexReader (org.apache.lucene.index.IndexReader)67 InputArrayIterator (org.apache.lucene.search.suggest.InputArrayIterator)65 StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer)64 Input (org.apache.lucene.search.suggest.Input)63 CharArraySet (org.apache.lucene.analysis.CharArraySet)58 ArrayList (java.util.ArrayList)57 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)57 TextField (org.apache.lucene.document.TextField)55