Search in sources :

Example 11 with PerFieldAnalyzerWrapper

use of org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper in project ansj_seg by NLPchina.

The class IndexTest, method indexTest.

@Test
public void indexTest() throws CorruptIndexException, LockObtainFailedException, IOException, ParseException {
    // Indexing analyzer: Ansj in index mode, wrapped so per-field analyzers
    // could be registered later (none are, so every field shares it).
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new AnsjAnalyzer(TYPE.index_ansj));
    IndexWriterConfig ic = new IndexWriterConfig(analyzer);
    // Build an in-memory index (no dead null pre-initialization needed).
    Directory directory = new RAMDirectory();
    IndexWriter iwriter = new IndexWriter(directory, ic);
    try {
        addContent(iwriter, "助推企业转型升级提供强有力的技术支持和服保障。中心的建成将使青岛的服务器承载能力突破10万台,达到世界一流水平。");
        addContent(iwriter, "涉及民生的部分商品和服务成本监审政策");
        addContent(iwriter, "我穿着和服");
        iwriter.commit();
    } finally {
        // Release the writer even if indexing throws (original leaked it on failure).
        iwriter.close();
    }
    System.out.println("索引建立完毕");
    // Query with the dictionary-mode analyzer for coarser-grained query terms.
    Analyzer queryAnalyzer = new AnsjAnalyzer(AnsjAnalyzer.TYPE.dic_ansj);
    System.out.println("index ok to search!");
    try {
        search(queryAnalyzer, directory, "\"和服\"");
    } finally {
        // The original never closed the directory; close it once searching is done.
        directory.close();
    }
}
Also used : AnsjAnalyzer(org.ansj.lucene7.AnsjAnalyzer) AnsjAnalyzer(org.ansj.lucene7.AnsjAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer) RAMDirectory(org.apache.lucene.store.RAMDirectory) PerFieldAnalyzerWrapper(org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper) RAMDirectory(org.apache.lucene.store.RAMDirectory) Directory(org.apache.lucene.store.Directory) Test(org.junit.Test)

Example 12 with PerFieldAnalyzerWrapper

use of org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper in project jena by apache.

The class TextIndexLucene, method getQueryAnalyzer.

/**
 * Returns the analyzer used to parse queries.
 *
 * @param usingSearchFor whether the query uses the searchFor (multilingual) form
 * @param lang language tag selecting the per-language multilingual analyzer
 * @return a cached per-language {@link PerFieldAnalyzerWrapper} when
 *         {@code usingSearchFor} is true, otherwise the default query analyzer
 */
private Analyzer getQueryAnalyzer(boolean usingSearchFor, String lang) {
    if (usingSearchFor) {
        // Lazily build and cache one multilingual analyzer per language.
        // computeIfAbsent replaces the original check-then-put, which could
        // construct duplicate analyzers under concurrent access.
        return multilingualQueryAnalyzers.computeIfAbsent(lang,
                l -> new PerFieldAnalyzerWrapper(new QueryMultilingualAnalyzer(defaultAnalyzer, l), analyzerPerField));
    }
    return queryAnalyzer;
}
Also used : QueryMultilingualAnalyzer(org.apache.jena.query.text.analyzer.QueryMultilingualAnalyzer) KeywordAnalyzer(org.apache.lucene.analysis.core.KeywordAnalyzer) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) IndexingMultilingualAnalyzer(org.apache.jena.query.text.analyzer.IndexingMultilingualAnalyzer) MultilingualAnalyzer(org.apache.jena.query.text.analyzer.MultilingualAnalyzer) QueryMultilingualAnalyzer(org.apache.jena.query.text.analyzer.QueryMultilingualAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer) PerFieldAnalyzerWrapper(org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper)

Example 13 with PerFieldAnalyzerWrapper

use of org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper in project Anserini by castorini.

The class IndexVectors, method main.

/**
 * Entry point: reads GloVe-style word vectors and indexes them into a Lucene
 * index, encoding each vector as text via the configured analyzer (fake-words
 * or lexical-LSH).
 *
 * @param args command-line arguments parsed into {@link IndexVectors.Args}
 * @throws Exception on unrecoverable I/O or indexing failure
 */
public static void main(String[] args) throws Exception {
    IndexVectors.Args indexArgs = new IndexVectors.Args();
    CmdLineParser parser = new CmdLineParser(indexArgs, ParserProperties.defaults().withUsageWidth(90));
    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        parser.printUsage(System.err);
        System.err.println("Example: " + IndexVectors.class.getSimpleName() + parser.printExample(OptionHandlerFilter.REQUIRED));
        return;
    }
    // Select the vector-to-text encoding analyzer; unknown encodings print usage and exit.
    Analyzer vectorAnalyzer;
    if (indexArgs.encoding.equalsIgnoreCase(FW)) {
        vectorAnalyzer = new FakeWordsEncoderAnalyzer(indexArgs.q);
    } else if (indexArgs.encoding.equalsIgnoreCase(LEXLSH)) {
        vectorAnalyzer = new LexicalLshAnalyzer(indexArgs.decimals, indexArgs.ngrams, indexArgs.hashCount, indexArgs.bucketCount, indexArgs.hashSetSize);
    } else {
        parser.printUsage(System.err);
        System.err.println("Example: " + IndexVectors.class.getSimpleName() + parser.printExample(OptionHandlerFilter.REQUIRED));
        return;
    }
    final long start = System.nanoTime();
    System.out.println(String.format("Loading model %s", indexArgs.input));
    Map<String, List<float[]>> vectors = readGloVe(indexArgs.input);
    Path indexDir = indexArgs.path;
    if (!Files.exists(indexDir)) {
        Files.createDirectories(indexDir);
    }
    System.out.println(String.format("Creating index at %s...", indexArgs.path));
    // Only FIELD_VECTOR uses the encoding analyzer; all other fields fall back to Standard.
    Map<String, Analyzer> map = new HashMap<>();
    map.put(FIELD_VECTOR, vectorAnalyzer);
    Analyzer analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(), map);
    IndexWriterConfig conf = new IndexWriterConfig(analyzer);
    final AtomicInteger cnt = new AtomicInteger();
    // try-with-resources: the original leaked the writer and directory when
    // indexing threw before the explicit close() calls.
    try (Directory d = FSDirectory.open(indexDir);
         IndexWriter indexWriter = new IndexWriter(d, conf)) {
        for (Map.Entry<String, List<float[]>> entry : vectors.entrySet()) {
            for (float[] vector : entry.getValue()) {
                Document doc = new Document();
                doc.add(new StringField(FIELD_ID, entry.getKey(), Field.Store.YES));
                doc.add(new TextField(FIELD_VECTOR, encodeVector(vector), indexArgs.stored ? Field.Store.YES : Field.Store.NO));
                try {
                    indexWriter.addDocument(doc);
                    int cur = cnt.incrementAndGet();
                    if (cur % 100000 == 0) {
                        System.out.println(String.format("%s docs added", cnt));
                    }
                } catch (IOException e) {
                    // Best-effort: report and keep indexing the remaining vectors.
                    System.err.println("Error while indexing: " + e.getLocalizedMessage());
                }
            }
        }
        indexWriter.commit();
        System.out.println(String.format("%s docs indexed", cnt.get()));
        long space = FileUtils.sizeOfDirectory(indexDir.toFile()) / (1024L * 1024L);
        System.out.println(String.format("Index size: %dMB", space));
    }
    final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
    System.out.println(String.format("Total time: %s", DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss")));
}

/**
 * Encodes a vector as space-separated components.
 * Iterates with {@code float} (the original widened each component to
 * {@code double}, turning e.g. 0.1f into "0.10000000149011612" and bloating
 * the indexed text with precision artifacts).
 */
private static String encodeVector(float[] vector) {
    StringBuilder sb = new StringBuilder();
    for (float fv : vector) {
        if (sb.length() > 0) {
            sb.append(' ');
        }
        sb.append(fv);
    }
    return sb.toString();
}
Also used : HashMap(java.util.HashMap) LexicalLshAnalyzer(io.anserini.ann.lexlsh.LexicalLshAnalyzer) FakeWordsEncoderAnalyzer(io.anserini.ann.fw.FakeWordsEncoderAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) Document(org.apache.lucene.document.Document) TextField(org.apache.lucene.document.TextField) LinkedList(java.util.LinkedList) List(java.util.List) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) Path(java.nio.file.Path) CmdLineParser(org.kohsuke.args4j.CmdLineParser) FakeWordsEncoderAnalyzer(io.anserini.ann.fw.FakeWordsEncoderAnalyzer) LexicalLshAnalyzer(io.anserini.ann.lexlsh.LexicalLshAnalyzer) IOException(java.io.IOException) PerFieldAnalyzerWrapper(org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper) IndexWriter(org.apache.lucene.index.IndexWriter) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) StringField(org.apache.lucene.document.StringField) HashMap(java.util.HashMap) Map(java.util.Map) CmdLineException(org.kohsuke.args4j.CmdLineException) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Example 14 with PerFieldAnalyzerWrapper

use of org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper in project stargate-core by tuplejump.

The class CassandraUtils, method getOptions.

/**
 * Builds index {@link Options} for a column family from a field mapping:
 * collects validators, field types, numeric configs, indexed partition and
 * clustering keys, and assembles the per-field analyzer wrapper.
 *
 * @param mapping index field mapping and analyzer configuration
 * @param baseCfs the Cassandra column family being indexed
 * @param colName name of the index column itself
 * @return fully populated Options for this index
 * @throws IllegalArgumentException if a mapped column has no definition
 */
public static Options getOptions(Properties mapping, ColumnFamilyStore baseCfs, String colName) {
    Map<String, NumericConfig> numericFieldOptions = new HashMap<>();
    Map<String, FieldType> fieldDocValueTypes = new TreeMap<>();
    Map<String, FieldType> collectionFieldDocValueTypes = new TreeMap<>();
    Map<String, FieldType> fieldTypes = new TreeMap<>();
    Map<String, FieldType[]> collectionFieldTypes = new TreeMap<>();
    Map<String, ColumnDefinition> validators = new TreeMap<>();
    Map<String, ColumnDefinition> clusteringKeysIndexed = new LinkedHashMap<>();
    Map<String, ColumnDefinition> partitionKeysIndexed = new LinkedHashMap<>();
    Set<String> indexedColumnNames;
    // Gather all the mapped field names.
    indexedColumnNames = new TreeSet<>();
    indexedColumnNames.addAll(mapping.getFields().keySet());
    Set<String> added = new HashSet<>(indexedColumnNames.size());
    List<ColumnDefinition> partitionKeys = baseCfs.metadata.partitionKeyColumns();
    List<ColumnDefinition> clusteringKeys = baseCfs.metadata.clusteringColumns();
    for (ColumnDefinition colDef : partitionKeys) {
        String columnName = colDef.name.toString();
        if (Options.logger.isDebugEnabled()) {
            // FIX: log the partition-key column name, not the index name.
            Options.logger.debug("Partition key name is {} and index is {}", columnName, colDef.position());
        }
        validators.put(columnName, colDef);
        if (indexedColumnNames.contains(columnName)) {
            // FIX: key by the column name (was colName — the index name —
            // which collapsed all indexed partition keys onto a single entry;
            // the clustering-key loop below already keys by columnName).
            partitionKeysIndexed.put(columnName, colDef);
            addPropertiesAndFieldType(mapping, numericFieldOptions, fieldDocValueTypes, collectionFieldDocValueTypes, fieldTypes, collectionFieldTypes, added, colDef, columnName);
        }
    }
    for (ColumnDefinition colDef : clusteringKeys) {
        String columnName = colDef.name.toString();
        if (Options.logger.isDebugEnabled()) {
            // FIX: log the clustering-key column name, not the index name.
            Options.logger.debug("Clustering key name is {} and index is {}", columnName, colDef.position() + 1);
        }
        validators.put(columnName, colDef);
        if (indexedColumnNames.contains(columnName)) {
            clusteringKeysIndexed.put(columnName, colDef);
            addPropertiesAndFieldType(mapping, numericFieldOptions, fieldDocValueTypes, collectionFieldDocValueTypes, fieldTypes, collectionFieldTypes, added, colDef, columnName);
        }
    }
    // Resolve any mapped columns not already handled as keys.
    for (String columnName : indexedColumnNames) {
        if (added.add(columnName.toLowerCase())) {
            Properties options = mapping.getFields().get(columnName);
            ColumnDefinition colDef = getColumnDefinition(baseCfs, columnName);
            if (colDef != null) {
                validators.put(columnName, colDef);
                addFieldType(columnName, colDef.type, options, numericFieldOptions, fieldDocValueTypes, collectionFieldDocValueTypes, fieldTypes, collectionFieldTypes);
            } else {
                throw new IllegalArgumentException(String.format("Column Definition for %s not found", columnName));
            }
            // Object-typed fields contribute their nested fields to the mapping.
            if (options.getType() == Type.object) {
                mapping.getFields().putAll(options.getFields());
            }
        }
    }
    // Regular (non-key) columns still need validators for value decoding.
    Set<ColumnDefinition> otherColumns = baseCfs.metadata.regularColumns();
    for (ColumnDefinition colDef : otherColumns) {
        String columnName = UTF8Type.instance.getString(colDef.name.bytes);
        validators.put(columnName, colDef);
    }
    numericFieldOptions.putAll(mapping.getDynamicNumericConfig());
    Analyzer defaultAnalyzer = mapping.getLuceneAnalyzer();
    Analyzer analyzer = new PerFieldAnalyzerWrapper(defaultAnalyzer, mapping.perFieldAnalyzers());
    // Derive the Stargate type of each validator; collections are flagged as nested.
    Map<String, Type> types = new TreeMap<>();
    Set<String> nestedFields = new TreeSet<>();
    for (Map.Entry<String, ColumnDefinition> entry : validators.entrySet()) {
        CQL3Type cql3Type = entry.getValue().type.asCQL3Type();
        AbstractType inner = getValueValidator(cql3Type.getType());
        if (cql3Type.isCollection()) {
            types.put(entry.getKey(), fromAbstractType(inner.asCQL3Type()));
            nestedFields.add(entry.getKey());
        } else {
            types.put(entry.getKey(), fromAbstractType(cql3Type));
        }
    }
    return new Options(mapping, numericFieldOptions, fieldDocValueTypes, collectionFieldDocValueTypes, fieldTypes, collectionFieldTypes, types, nestedFields, clusteringKeysIndexed, partitionKeysIndexed, indexedColumnNames, analyzer, colName);
}
Also used : CQL3Type(org.apache.cassandra.cql3.CQL3Type) Options(com.tuplejump.stargate.lucene.Options) Properties(com.tuplejump.stargate.lucene.Properties) Analyzer(org.apache.lucene.analysis.Analyzer) FieldType(org.apache.lucene.document.FieldType) ColumnDefinition(org.apache.cassandra.config.ColumnDefinition) PerFieldAnalyzerWrapper(org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper) Type(com.tuplejump.stargate.lucene.Type) FieldType(org.apache.lucene.document.FieldType) CQL3Type(org.apache.cassandra.cql3.CQL3Type) NumericConfig(org.apache.lucene.queryparser.flexible.standard.config.NumericConfig)

Example 15 with PerFieldAnalyzerWrapper

use of org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper in project HongsCORE by ihongs.

The class LuceneRecord, method getAnalyzer.

// ** Low-level helpers **/
/**
 * Builds the analyzer used when storing documents: a per-field wrapper that
 * dispatches searchable fields (whose names are prefixed with "$") to their
 * configured analyzers and everything else to {@link StandardAnalyzer}.
 *
 * @return the per-field storage analyzer
 * @throws HongsException if field configuration cannot be read
 * @deprecated no longer needs to be pre-built; a TokenStream is now
 *             constructed when the value is written
 */
@Deprecated
protected Analyzer getAnalyzer() throws HongsException {
    /*Default*/
    Analyzer ad = new StandardAnalyzer();
    // FIX: diamond operator instead of the raw HashMap (unchecked warning).
    Map<String, Analyzer> az = new HashMap<>();
    Map<String, Map> fs = getFields();
    for (Map.Entry<String, Map> et : fs.entrySet()) {
        String fn = et.getKey();
        Map fc = et.getValue();
        if (srchable(fc)) {
            // Note: field names used for searching are prefixed with $.
            az.put("$" + fn, getAnalyzer(fc));
        }
    }
    return new PerFieldAnalyzerWrapper(ad, az);
}
Also used : HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) CustomAnalyzer(org.apache.lucene.analysis.custom.CustomAnalyzer) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer) Map(java.util.Map) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) PerFieldAnalyzerWrapper(org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper)

Aggregations

Analyzer (org.apache.lucene.analysis.Analyzer)22 PerFieldAnalyzerWrapper (org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper)22 HashMap (java.util.HashMap)12 RAMDirectory (org.apache.lucene.store.RAMDirectory)11 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)10 StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer)8 Document (org.apache.lucene.document.Document)8 TextField (org.apache.lucene.document.TextField)8 IndexWriter (org.apache.lucene.index.IndexWriter)8 Field (org.apache.lucene.document.Field)7 Directory (org.apache.lucene.store.Directory)6 Test (org.junit.Test)6 LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter)4 Tokenizer (org.apache.lucene.analysis.Tokenizer)4 WhitespaceAnalyzer (org.apache.lucene.analysis.core.WhitespaceAnalyzer)4 StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer)4 DirectoryReader (org.apache.lucene.index.DirectoryReader)4 IOException (java.io.IOException)3 Map (java.util.Map)3 SKOSAnalyzer (at.ac.univie.mminf.luceneSKOS.analysis.SKOSAnalyzer)2