Use of org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper in project ansj_seg by NLPchina.
The class IndexTest, method indexTest:
@Test
public void indexTest() throws CorruptIndexException, LockObtainFailedException, IOException, ParseException {
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new AnsjAnalyzer(TYPE.index_ansj));
    Directory directory = null;
    IndexWriter iwriter = null;
    IndexWriterConfig ic = new IndexWriterConfig(analyzer);
    // Build an in-memory index.
    directory = new RAMDirectory();
    iwriter = new IndexWriter(directory, ic);
    addContent(iwriter, "助推企业转型升级提供强有力的技术支持和服保障。中心的建成将使青岛的服务器承载能力突破10万台,达到世界一流水平。");
    addContent(iwriter, "涉及民生的部分商品和服务成本监审政策");
    addContent(iwriter, "我穿着和服");
    iwriter.commit();
    iwriter.close();
    System.out.println("索引建立完毕"); // "Index built."
    Analyzer queryAnalyzer = new AnsjAnalyzer(AnsjAnalyzer.TYPE.dic_ansj);
    System.out.println("index ok to search!");
    search(queryAnalyzer, directory, "\"和服\"");
}
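The addContent and search helpers are defined elsewhere in IndexTest. A minimal sketch of what they could look like, assuming a single field named "text" stored alongside the index (the field name, stored flag, and top-10 cutoff are assumptions, not the project's code; standard Lucene core and classic queryparser imports apply):

// Hypothetical reconstruction of the two helpers the test calls.
private void addContent(IndexWriter iwriter, String content) throws IOException {
    Document doc = new Document();
    doc.add(new TextField("text", content, Field.Store.YES));
    iwriter.addDocument(doc);
}

private void search(Analyzer queryAnalyzer, Directory directory, String queryStr) throws IOException, ParseException {
    try (IndexReader reader = DirectoryReader.open(directory)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        // The phrase query is tokenized by the dictionary analyzer before matching.
        Query query = new QueryParser("text", queryAnalyzer).parse(queryStr);
        TopDocs hits = searcher.search(query, 10);
        for (ScoreDoc sd : hits.scoreDocs) {
            System.out.println(searcher.doc(sd.doc).get("text"));
        }
    }
}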
Use of org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper in project jena by apache.
The class TextIndexLucene, method getQueryAnalyzer:
private Analyzer getQueryAnalyzer(boolean usingSearchFor, String lang) {
    if (usingSearchFor) {
        // Lazily build and cache one multilingual query analyzer per language.
        Analyzer qa = multilingualQueryAnalyzers.get(lang);
        if (qa == null) {
            qa = new PerFieldAnalyzerWrapper(new QueryMultilingualAnalyzer(defaultAnalyzer, lang), analyzerPerField);
            multilingualQueryAnalyzers.put(lang, qa);
        }
        return qa;
    } else {
        return queryAnalyzer;
    }
}
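The get-then-put above is not atomic, so two concurrent queries for the same language can each build a wrapper (harmless here, since both are equivalent). If multilingualQueryAnalyzers were a ConcurrentHashMap (an assumption; its declaration is not shown in this excerpt), the cache collapses into a single computeIfAbsent call:

// Sketch only: assumes multilingualQueryAnalyzers is a ConcurrentHashMap<String, Analyzer>.
private Analyzer getQueryAnalyzer(boolean usingSearchFor, String lang) {
    if (!usingSearchFor) {
        return queryAnalyzer;
    }
    return multilingualQueryAnalyzers.computeIfAbsent(lang,
        l -> new PerFieldAnalyzerWrapper(new QueryMultilingualAnalyzer(defaultAnalyzer, l), analyzerPerField));
}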
Use of org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper in project Anserini by castorini.
The class IndexVectors, method main:
public static void main(String[] args) throws Exception {
    IndexVectors.Args indexArgs = new IndexVectors.Args();
    CmdLineParser parser = new CmdLineParser(indexArgs, ParserProperties.defaults().withUsageWidth(90));
    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        parser.printUsage(System.err);
        System.err.println("Example: " + IndexVectors.class.getSimpleName() + parser.printExample(OptionHandlerFilter.REQUIRED));
        return;
    }
    // Pick how dense vectors are encoded as text: fake words or lexical LSH.
    Analyzer vectorAnalyzer;
    if (indexArgs.encoding.equalsIgnoreCase(FW)) {
        vectorAnalyzer = new FakeWordsEncoderAnalyzer(indexArgs.q);
    } else if (indexArgs.encoding.equalsIgnoreCase(LEXLSH)) {
        vectorAnalyzer = new LexicalLshAnalyzer(indexArgs.decimals, indexArgs.ngrams, indexArgs.hashCount, indexArgs.bucketCount, indexArgs.hashSetSize);
    } else {
        parser.printUsage(System.err);
        System.err.println("Example: " + IndexVectors.class.getSimpleName() + parser.printExample(OptionHandlerFilter.REQUIRED));
        return;
    }
    final long start = System.nanoTime();
    System.out.println(String.format("Loading model %s", indexArgs.input));
    Map<String, List<float[]>> vectors = readGloVe(indexArgs.input);
    Path indexDir = indexArgs.path;
    if (!Files.exists(indexDir)) {
        Files.createDirectories(indexDir);
    }
    System.out.println(String.format("Creating index at %s...", indexArgs.path));
    Directory d = FSDirectory.open(indexDir);
    // Only FIELD_VECTOR goes through the encoder; every other field falls back to StandardAnalyzer.
    Map<String, Analyzer> map = new HashMap<>();
    map.put(FIELD_VECTOR, vectorAnalyzer);
    Analyzer analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(), map);
    IndexWriterConfig conf = new IndexWriterConfig(analyzer);
    IndexWriter indexWriter = new IndexWriter(d, conf);
    final AtomicInteger cnt = new AtomicInteger();
    for (Map.Entry<String, List<float[]>> entry : vectors.entrySet()) {
        for (float[] vector : entry.getValue()) {
            Document doc = new Document();
            doc.add(new StringField(FIELD_ID, entry.getKey(), Field.Store.YES));
            // Serialize the vector as space-separated components; the analyzer re-encodes them at index time.
            StringBuilder sb = new StringBuilder();
            for (double fv : vector) {
                if (sb.length() > 0) {
                    sb.append(' ');
                }
                sb.append(fv);
            }
            doc.add(new TextField(FIELD_VECTOR, sb.toString(), indexArgs.stored ? Field.Store.YES : Field.Store.NO));
            try {
                indexWriter.addDocument(doc);
                int cur = cnt.incrementAndGet();
                if (cur % 100000 == 0) {
                    System.out.println(String.format("%s docs added", cnt));
                }
            } catch (IOException e) {
                System.err.println("Error while indexing: " + e.getLocalizedMessage());
            }
        }
    }
    indexWriter.commit();
    System.out.println(String.format("%s docs indexed", cnt.get()));
    long space = FileUtils.sizeOfDirectory(indexDir.toFile()) / (1024L * 1024L);
    System.out.println(String.format("Index size: %dMB", space));
    indexWriter.close();
    d.close();
    final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
    System.out.println(String.format("Total time: %s", DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss")));
}
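Searching this index requires the same per-field wiring, otherwise the query text is not tokenized by the encoder and nothing matches. A rough query-side sketch under that assumption (the q value of 30, the literal query vector, and the top-10 cutoff are placeholders; negative components would need escaping for the classic QueryParser, which treats a leading "-" as NOT):

// Rebuild the per-field wrapper for query parsing; values are illustrative only.
Analyzer queryVectorAnalyzer = new FakeWordsEncoderAnalyzer(30); // must match indexArgs.q used at index time
Map<String, Analyzer> perField = new HashMap<>();
perField.put(FIELD_VECTOR, queryVectorAnalyzer);
Analyzer searchAnalyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(), perField);
String queryVector = "0.12 0.05 0.33"; // a real query uses the model's full dimensionality
try (IndexReader reader = DirectoryReader.open(FSDirectory.open(indexDir))) {
    IndexSearcher searcher = new IndexSearcher(reader);
    Query q = new QueryParser(FIELD_VECTOR, searchAnalyzer).parse(queryVector);
    for (ScoreDoc sd : searcher.search(q, 10).scoreDocs) {
        System.out.println(searcher.doc(sd.doc).get(FIELD_ID));
    }
}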
Use of org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper in project stargate-core by tuplejump.
The class CassandraUtils, method getOptions:
public static Options getOptions(Properties mapping, ColumnFamilyStore baseCfs, String colName) {
    Map<String, NumericConfig> numericFieldOptions = new HashMap<>();
    Map<String, FieldType> fieldDocValueTypes = new TreeMap<>();
    Map<String, FieldType> collectionFieldDocValueTypes = new TreeMap<>();
    Map<String, FieldType> fieldTypes = new TreeMap<>();
    Map<String, FieldType[]> collectionFieldTypes = new TreeMap<>();
    Map<String, ColumnDefinition> validators = new TreeMap<>();
    Map<String, ColumnDefinition> clusteringKeysIndexed = new LinkedHashMap<>();
    Map<String, ColumnDefinition> partitionKeysIndexed = new LinkedHashMap<>();
    Set<String> indexedColumnNames;
    // Collect the options of all mapped fields.
    indexedColumnNames = new TreeSet<>();
    indexedColumnNames.addAll(mapping.getFields().keySet());
    Set<String> added = new HashSet<>(indexedColumnNames.size());
    List<ColumnDefinition> partitionKeys = baseCfs.metadata.partitionKeyColumns();
    List<ColumnDefinition> clusteringKeys = baseCfs.metadata.clusteringColumns();
    for (ColumnDefinition colDef : partitionKeys) {
        String columnName = colDef.name.toString();
        if (Options.logger.isDebugEnabled()) {
            Options.logger.debug("Partition key name is {} and index is {}", columnName, colDef.position());
        }
        validators.put(columnName, colDef);
        if (indexedColumnNames.contains(columnName)) {
            // Key the indexed partition column by its own name (colName is the index's column, not this one).
            partitionKeysIndexed.put(columnName, colDef);
            addPropertiesAndFieldType(mapping, numericFieldOptions, fieldDocValueTypes, collectionFieldDocValueTypes, fieldTypes, collectionFieldTypes, added, colDef, columnName);
        }
    }
    for (ColumnDefinition colDef : clusteringKeys) {
        String columnName = colDef.name.toString();
        if (Options.logger.isDebugEnabled()) {
            Options.logger.debug("Clustering key name is {} and index is {}", columnName, colDef.position() + 1);
        }
        validators.put(columnName, colDef);
        if (indexedColumnNames.contains(columnName)) {
            clusteringKeysIndexed.put(columnName, colDef);
            addPropertiesAndFieldType(mapping, numericFieldOptions, fieldDocValueTypes, collectionFieldDocValueTypes, fieldTypes, collectionFieldTypes, added, colDef, columnName);
        }
    }
    for (String columnName : indexedColumnNames) {
        if (added.add(columnName.toLowerCase())) {
            Properties options = mapping.getFields().get(columnName);
            ColumnDefinition colDef = getColumnDefinition(baseCfs, columnName);
            if (colDef != null) {
                validators.put(columnName, colDef);
                addFieldType(columnName, colDef.type, options, numericFieldOptions, fieldDocValueTypes, collectionFieldDocValueTypes, fieldTypes, collectionFieldTypes);
            } else {
                throw new IllegalArgumentException(String.format("Column Definition for %s not found", columnName));
            }
            if (options.getType() == Type.object) {
                mapping.getFields().putAll(options.getFields());
            }
        }
    }
    Set<ColumnDefinition> otherColumns = baseCfs.metadata.regularColumns();
    for (ColumnDefinition colDef : otherColumns) {
        String columnName = UTF8Type.instance.getString(colDef.name.bytes);
        validators.put(columnName, colDef);
    }
    numericFieldOptions.putAll(mapping.getDynamicNumericConfig());
    Analyzer defaultAnalyzer = mapping.getLuceneAnalyzer();
    Analyzer analyzer = new PerFieldAnalyzerWrapper(defaultAnalyzer, mapping.perFieldAnalyzers());
    Map<String, Type> types = new TreeMap<>();
    Set<String> nestedFields = new TreeSet<>();
    for (Map.Entry<String, ColumnDefinition> entry : validators.entrySet()) {
        CQL3Type cql3Type = entry.getValue().type.asCQL3Type();
        AbstractType inner = getValueValidator(cql3Type.getType());
        if (cql3Type.isCollection()) {
            types.put(entry.getKey(), fromAbstractType(inner.asCQL3Type()));
            nestedFields.add(entry.getKey());
        } else {
            types.put(entry.getKey(), fromAbstractType(cql3Type));
        }
    }
    return new Options(mapping, numericFieldOptions, fieldDocValueTypes, collectionFieldDocValueTypes, fieldTypes, collectionFieldTypes, types, nestedFields, clusteringKeysIndexed, partitionKeysIndexed, indexedColumnNames, analyzer, colName);
}
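mapping.perFieldAnalyzers() supplies only the overrides; PerFieldAnalyzerWrapper falls back to defaultAnalyzer for every field absent from that map. A tiny self-contained illustration of that fallback (field names and analyzer choices are placeholders):

// Fields present in the map get their own analyzer; everything else uses the default.
Map<String, Analyzer> overrides = new HashMap<>();
overrides.put("id", new KeywordAnalyzer()); // exact, untokenized matching for "id"
Analyzer wrapper = new PerFieldAnalyzerWrapper(new StandardAnalyzer(), overrides);
// wrapper.tokenStream("id", ...)   -> KeywordAnalyzer (one verbatim token)
// wrapper.tokenStream("body", ...) -> StandardAnalyzer (lowercased word tokens)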
Use of org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper in project HongsCORE by ihongs.
The class LuceneRecord, method getAnalyzer:
// ** Low-level utilities **
/**
 * Analyzer used for storage.
 * @return
 * @throws HongsException
 * @deprecated No longer needs to be set up in advance; a TokenStream is now built when the value is written.
 */
protected Analyzer getAnalyzer() throws HongsException {
    /* Default */
    Analyzer ad = new StandardAnalyzer();
    Map<String, Analyzer> az = new HashMap<>();
    Map<String, Map> fs = getFields();
    for (Map.Entry<String, Map> et : fs.entrySet()) {
        String fn = et.getKey();
        Map fc = et.getValue();
        if (srchable(fc)) {
            // Note: field names used for search are prefixed with "$".
            az.put("$" + fn, getAnalyzer(fc));
        }
    }
    return new PerFieldAnalyzerWrapper(ad, az);
}
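The @deprecated note points at the replacement pattern: instead of pre-wiring a per-field wrapper, build a TokenStream for each value at write time. A minimal sketch of that idea, assuming a plain StandardAnalyzer and an already-open IndexWriter (both placeholders, not this project's code):

// Per-value analysis at write time; no PerFieldAnalyzerWrapper is needed up front.
Analyzer fieldAnalyzer = new StandardAnalyzer();
TokenStream ts = fieldAnalyzer.tokenStream("$name", "value to index");
Document doc = new Document();
doc.add(new TextField("$name", ts)); // TextField(String, TokenStream) is indexed but not stored
indexWriter.addDocument(doc);        // the writer consumes and closes the stream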