Use of info.debatty.java.lsh.SuperBit in the zuliasearch project by zuliaio.
Class ServerIndexConfig, method configure.
/**
 * Rebuilds all lookup tables of this config from the given index settings.
 *
 * <p>The tables are recreated in this order: analyzers (built-in defaults first, then the
 * analyzers declared in the settings), field configs keyed by stored field name, index-field
 * lookups, facet lookups, index and sort field types, and finally the superbit settings
 * together with one {@link SuperBit} instance per projected field.
 *
 * @param indexSettings the settings to read analyzers and field configs from
 */
public void configure(IndexSettings indexSettings) {
    this.indexSettings = indexSettings;

    // Built-in analyzers go in first; analyzers from the settings are put afterwards,
    // so a settings entry with the same name replaces the built-in one.
    this.analyzerMap = new ConcurrentHashMap<>();
    analyzerMap.put(DefaultAnalyzers.STANDARD,
            AnalyzerSettings.newBuilder()
                    .setName(DefaultAnalyzers.STANDARD)
                    .addFilter(Filter.LOWERCASE)
                    .addFilter(Filter.STOPWORDS)
                    .build());
    analyzerMap.put(DefaultAnalyzers.KEYWORD,
            AnalyzerSettings.newBuilder()
                    .setName(DefaultAnalyzers.KEYWORD)
                    .setTokenizer(Tokenizer.KEYWORD)
                    .build());
    analyzerMap.put(DefaultAnalyzers.LC_KEYWORD,
            AnalyzerSettings.newBuilder()
                    .setName(DefaultAnalyzers.LC_KEYWORD)
                    .setTokenizer(Tokenizer.KEYWORD)
                    .addFilter(Filter.LOWERCASE)
                    .build());
    analyzerMap.put(DefaultAnalyzers.MIN_STEM,
            AnalyzerSettings.newBuilder()
                    .setName(DefaultAnalyzers.MIN_STEM)
                    .setTokenizer(Tokenizer.STANDARD)
                    .addFilter(Filter.LOWERCASE)
                    .addFilter(Filter.STOPWORDS)
                    .addFilter(Filter.ENGLISH_MIN_STEM)
                    .build());
    analyzerMap.put(DefaultAnalyzers.TWO_TWO_SHINGLE,
            AnalyzerSettings.newBuilder()
                    .setName(DefaultAnalyzers.TWO_TWO_SHINGLE)
                    .setTokenizer(Tokenizer.STANDARD)
                    .addFilter(Filter.LOWERCASE)
                    .addFilter(Filter.TWO_TWO_SHINGLE)
                    .build());
    analyzerMap.put(DefaultAnalyzers.THREE_THREE_SHINGLE,
            AnalyzerSettings.newBuilder()
                    .setName(DefaultAnalyzers.THREE_THREE_SHINGLE)
                    .setTokenizer(Tokenizer.STANDARD)
                    .addFilter(Filter.LOWERCASE)
                    .addFilter(Filter.THREE_THREE_SHINGLE)
                    .build());
    analyzerMap.put(DefaultAnalyzers.LC_CONCAT_ALL,
            AnalyzerSettings.newBuilder()
                    .setName(DefaultAnalyzers.LC_CONCAT_ALL)
                    .setTokenizer(Tokenizer.KEYWORD)
                    .addFilter(Filter.LOWERCASE)
                    .addFilter(Filter.CONCAT_ALL)
                    .build());
    analyzerMap.put(DefaultAnalyzers.KSTEMMED,
            AnalyzerSettings.newBuilder()
                    .setName(DefaultAnalyzers.KSTEMMED)
                    .setTokenizer(Tokenizer.STANDARD)
                    .addFilter(Filter.LOWERCASE)
                    .addFilter(Filter.STOPWORDS)
                    .addFilter(Filter.KSTEM)
                    .build());
    analyzerMap.put(DefaultAnalyzers.LSH,
            AnalyzerSettings.newBuilder()
                    .setName(DefaultAnalyzers.LSH)
                    .setTokenizer(Tokenizer.STANDARD)
                    .addFilter(Filter.LOWERCASE)
                    .addFilter(Filter.ASCII_FOLDING)
                    .addFilter(Filter.KSTEM)
                    .addFilter(Filter.STOPWORDS)
                    .addFilter(Filter.FIVE_FIVE_SHINGLE)
                    .addFilter(Filter.MINHASH)
                    .build());
    for (AnalyzerSettings declaredAnalyzer : indexSettings.getAnalyzerSettingsList()) {
        analyzerMap.put(declaredAnalyzer.getName(), declaredAnalyzer);
    }

    // Stored field name -> field config; a later config with the same stored name wins.
    this.fieldConfigMap = new ConcurrentHashMap<>();
    for (FieldConfig declaredField : indexSettings.getFieldConfigList()) {
        fieldConfigMap.put(declaredField.getStoredFieldName(), declaredField);
    }

    // Index field name -> IndexAs, and index field name -> owning stored field name.
    this.indexAsMap = new ConcurrentHashMap<>();
    this.indexToStoredMap = new ConcurrentHashMap<>();
    for (FieldConfig fieldConfig : fieldConfigMap.values()) {
        String storedName = fieldConfig.getStoredFieldName();
        for (IndexAs indexing : fieldConfig.getIndexAsList()) {
            String indexName = indexing.getIndexFieldName();
            indexAsMap.put(indexName, indexing);
            indexToStoredMap.put(indexName, storedName);
        }
    }

    // Facet name -> FacetAs.
    this.facetAsMap = new ConcurrentHashMap<>();
    for (FieldConfig fieldConfig : fieldConfigMap.values()) {
        for (FacetAs faceting : fieldConfig.getFacetAsList()) {
            facetAsMap.put(faceting.getFacetName(), faceting);
        }
    }

    // Index field name -> field type of the owning field config.
    this.indexFieldType = new ConcurrentHashMap<>();
    for (FieldConfig fieldConfig : fieldConfigMap.values()) {
        for (IndexAs indexing : fieldConfig.getIndexAsList()) {
            indexFieldType.put(indexing.getIndexFieldName(), fieldConfig.getFieldType());
        }
    }

    // Sort field name -> field type of the owning field config.
    this.sortFieldType = new ConcurrentHashMap<>();
    for (FieldConfig fieldConfig : fieldConfigMap.values()) {
        for (SortAs sorting : fieldConfig.getSortAsList()) {
            sortFieldType.put(sorting.getSortFieldName(), fieldConfig.getFieldType());
        }
    }

    // Projected field -> superbit settings, and projected field -> SuperBit built from them.
    this.superbitConfigMap = new ConcurrentHashMap<>();
    this.superbitMap = new ConcurrentHashMap<>();
    for (FieldConfig fieldConfig : fieldConfigMap.values()) {
        for (ProjectAs projection : fieldConfig.getProjectAsList()) {
            String projectedField = projection.getField();
            if (!projection.hasSuperbit()) {
                continue;
            }
            Superbit superbitSettings = projection.getSuperbit();
            superbitConfigMap.put(projectedField, superbitSettings);
            // NOTE(review): the input dimension is passed as both the first and the second
            // constructor argument -- confirm against the java-lsh SuperBit constructor
            // that this is the intended depth.
            SuperBit hasher = new SuperBit(superbitSettings.getInputDim(), superbitSettings.getInputDim(),
                    superbitSettings.getBatches(), superbitSettings.getSeed());
            superbitMap.put(projectedField, hasher);
        }
    }
}
Use of info.debatty.java.lsh.SuperBit in the zuliasearch project by zuliaio.
Class ZuliaIndex, method getCosineSimQuery.
/**
 * Builds a boolean query that matches documents whose indexed superbit signature for
 * {@code field} agrees with the signature of {@code vector} on a minimum number of bits.
 *
 * <p>One SHOULD term clause is added per signature bit, on the field
 * {@code SUPERBIT_PREFIX + "." + field + "." + bitIndex} with the term {@code "1"} or
 * {@code "0"}. The minimum number of clauses that must match is
 * {@code (1 - acos(similarity) / PI) * signatureLength}, truncated to an int.
 *
 * <p>Note: {@link Math#acos(double)} returns NaN for a similarity outside {@code [-1, 1]},
 * and casting NaN to {@code int} yields 0, so such a similarity results in a minimum of 0.
 *
 * @param query  the query supplying the vector similarity threshold
 * @param vector the query vector to hash
 * @param field  the field to search
 * @return the boolean query over the per-bit signature fields
 * @throws IllegalArgumentException if no SuperBit is available for {@code field}
 */
private Query getCosineSimQuery(ZuliaQuery.Query query, double[] vector, String field) {
    SuperBit superBit = indexConfig.getSuperBitForField(field);
    if (superBit == null) {
        // Fail with a descriptive error instead of a bare NullPointerException on the next line.
        throw new IllegalArgumentException("Field <" + field + "> is not configured with a superbit projection");
    }

    boolean[] signature = superBit.signature(vector);
    int mm = (int) ((1 - (Math.acos(query.getVectorSimilarity()) / Math.PI)) * signature.length);

    BooleanQuery.Builder booleanQueryBuilder = new BooleanQuery.Builder();
    booleanQueryBuilder.setMinimumNumberShouldMatch(mm);

    // The part of the per-bit field name that does not depend on the bit index.
    String fieldNamePrefix = ZuliaConstants.SUPERBIT_PREFIX + "." + field + ".";
    for (int i = 0; i < signature.length; i++) {
        String fieldName = fieldNamePrefix + i;
        booleanQueryBuilder.add(new BooleanClause(new TermQuery(new org.apache.lucene.index.Term(fieldName, signature[i] ? "1" : "0")), BooleanClause.Occur.SHOULD));
    }
    return booleanQueryBuilder.build();
}
Use of info.debatty.java.lsh.SuperBit in the zuliasearch project by zuliaio.
Class ShardDocumentIndexer, method handleProjectForStoredField.
/**
 * Indexes the superbit signature of a stored vector value for every superbit projection
 * declared on the field config.
 *
 * <p>For each projection that has superbit settings, the value is converted to a
 * {@code double[]}, hashed with the SuperBit registered for the projection's field, and each
 * signature bit is indexed as {@code "1"} or {@code "0"} under the field name
 * {@code SUPERBIT_PREFIX + "." + field + "." + bitIndex}. Projections without superbit
 * settings are skipped.
 *
 * @param luceneDocument the document to add the signature fields to
 * @param fc             the field config whose projections are processed
 * @param o              the stored value; must be a {@link List} of {@link Number}s for
 *                       superbit projections
 * @throws Exception if {@code o} is not a list, or if the list contains a null or
 *                   non-numeric element
 */
private void handleProjectForStoredField(Document luceneDocument, ZuliaIndex.FieldConfig fc, Object o) throws Exception {
    for (ZuliaIndex.ProjectAs projectAs : fc.getProjectAsList()) {
        if (!projectAs.hasSuperbit()) {
            continue;
        }

        String field = projectAs.getField();
        if (!(o instanceof List)) {
            throw new Exception("Expecting a list for superbit field <" + field + ">");
        }

        // Validate each element rather than trusting an unchecked cast to List<Number>:
        // a null or non-numeric entry would otherwise surface as a bare
        // NullPointerException / ClassCastException without any field context.
        List<?> values = (List<?>) o;
        double[] vec = new double[values.size()];
        int i = 0;
        for (Object value : values) {
            if (!(value instanceof Number)) {
                throw new Exception(
                        "Expecting a list of numbers for superbit field <" + field + "> but found <" + value + "> at position " + i);
            }
            vec[i++] = ((Number) value).doubleValue();
        }

        SuperBit superBitForField = indexConfig.getSuperBitForField(field);
        boolean[] signature = superBitForField.signature(vec);

        // The part of the per-bit field name that does not depend on the bit index.
        String indexedFieldPrefix = ZuliaConstants.SUPERBIT_PREFIX + "." + field + ".";
        for (int j = 0; j < signature.length; j++) {
            StringFieldIndexer.INSTANCE.index(luceneDocument, field, signature[j] ? "1" : "0", indexedFieldPrefix + j);
        }
    }
}
Aggregations