Search in sources :

Example 1 with SuperBit

use of info.debatty.java.lsh.SuperBit in project zuliasearch by zuliaio.

the class ServerIndexConfig method configure.

/**
 * Rebuilds all lookup tables of this config from the given index settings.
 *
 * <p>The following maps are replaced with freshly populated instances, in this order:
 * analyzers by name, field configs by stored field name, index-as settings and stored
 * field names by index field name, facet-as settings by facet name, field types by
 * index field name, field types by sort field name, and superbit settings / SuperBit
 * instances by projected field name.
 *
 * @param indexSettings the settings to read analyzers and field configs from
 */
public void configure(IndexSettings indexSettings) {
    this.indexSettings = indexSettings;

    // Built-in analyzers are registered first; analyzers from the index settings are put
    // afterwards, so a configured analyzer with the same name replaces the built-in one.
    this.analyzerMap = new ConcurrentHashMap<>();

    AnalyzerSettings standard = AnalyzerSettings.newBuilder()
            .setName(DefaultAnalyzers.STANDARD)
            .addFilter(Filter.LOWERCASE)
            .addFilter(Filter.STOPWORDS)
            .build();
    analyzerMap.put(DefaultAnalyzers.STANDARD, standard);

    AnalyzerSettings keyword = AnalyzerSettings.newBuilder()
            .setName(DefaultAnalyzers.KEYWORD)
            .setTokenizer(Tokenizer.KEYWORD)
            .build();
    analyzerMap.put(DefaultAnalyzers.KEYWORD, keyword);

    AnalyzerSettings lowercaseKeyword = AnalyzerSettings.newBuilder()
            .setName(DefaultAnalyzers.LC_KEYWORD)
            .setTokenizer(Tokenizer.KEYWORD)
            .addFilter(Filter.LOWERCASE)
            .build();
    analyzerMap.put(DefaultAnalyzers.LC_KEYWORD, lowercaseKeyword);

    AnalyzerSettings minStem = AnalyzerSettings.newBuilder()
            .setName(DefaultAnalyzers.MIN_STEM)
            .setTokenizer(Tokenizer.STANDARD)
            .addFilter(Filter.LOWERCASE)
            .addFilter(Filter.STOPWORDS)
            .addFilter(Filter.ENGLISH_MIN_STEM)
            .build();
    analyzerMap.put(DefaultAnalyzers.MIN_STEM, minStem);

    AnalyzerSettings twoTwoShingle = AnalyzerSettings.newBuilder()
            .setName(DefaultAnalyzers.TWO_TWO_SHINGLE)
            .setTokenizer(Tokenizer.STANDARD)
            .addFilter(Filter.LOWERCASE)
            .addFilter(Filter.TWO_TWO_SHINGLE)
            .build();
    analyzerMap.put(DefaultAnalyzers.TWO_TWO_SHINGLE, twoTwoShingle);

    AnalyzerSettings threeThreeShingle = AnalyzerSettings.newBuilder()
            .setName(DefaultAnalyzers.THREE_THREE_SHINGLE)
            .setTokenizer(Tokenizer.STANDARD)
            .addFilter(Filter.LOWERCASE)
            .addFilter(Filter.THREE_THREE_SHINGLE)
            .build();
    analyzerMap.put(DefaultAnalyzers.THREE_THREE_SHINGLE, threeThreeShingle);

    AnalyzerSettings lowercaseConcatAll = AnalyzerSettings.newBuilder()
            .setName(DefaultAnalyzers.LC_CONCAT_ALL)
            .setTokenizer(Tokenizer.KEYWORD)
            .addFilter(Filter.LOWERCASE)
            .addFilter(Filter.CONCAT_ALL)
            .build();
    analyzerMap.put(DefaultAnalyzers.LC_CONCAT_ALL, lowercaseConcatAll);

    AnalyzerSettings kStemmed = AnalyzerSettings.newBuilder()
            .setName(DefaultAnalyzers.KSTEMMED)
            .setTokenizer(Tokenizer.STANDARD)
            .addFilter(Filter.LOWERCASE)
            .addFilter(Filter.STOPWORDS)
            .addFilter(Filter.KSTEM)
            .build();
    analyzerMap.put(DefaultAnalyzers.KSTEMMED, kStemmed);

    AnalyzerSettings lsh = AnalyzerSettings.newBuilder()
            .setName(DefaultAnalyzers.LSH)
            .setTokenizer(Tokenizer.STANDARD)
            .addFilter(Filter.LOWERCASE)
            .addFilter(Filter.ASCII_FOLDING)
            .addFilter(Filter.KSTEM)
            .addFilter(Filter.STOPWORDS)
            .addFilter(Filter.FIVE_FIVE_SHINGLE)
            .addFilter(Filter.MINHASH)
            .build();
    analyzerMap.put(DefaultAnalyzers.LSH, lsh);

    for (AnalyzerSettings configuredAnalyzer : indexSettings.getAnalyzerSettingsList()) {
        analyzerMap.put(configuredAnalyzer.getName(), configuredAnalyzer);
    }

    // Every field config is keyed by its own stored field name, so the loops below can
    // walk the map's values and read the name back from the config itself.
    this.fieldConfigMap = new ConcurrentHashMap<>();
    for (FieldConfig fieldConfig : indexSettings.getFieldConfigList()) {
        fieldConfigMap.put(fieldConfig.getStoredFieldName(), fieldConfig);
    }

    // index field name -> IndexAs settings, and index field name -> stored field name
    this.indexAsMap = new ConcurrentHashMap<>();
    this.indexToStoredMap = new ConcurrentHashMap<>();
    for (FieldConfig fieldConfig : fieldConfigMap.values()) {
        String storedName = fieldConfig.getStoredFieldName();
        for (IndexAs indexAs : fieldConfig.getIndexAsList()) {
            String indexFieldName = indexAs.getIndexFieldName();
            indexAsMap.put(indexFieldName, indexAs);
            indexToStoredMap.put(indexFieldName, storedName);
        }
    }

    // facet name -> FacetAs settings
    this.facetAsMap = new ConcurrentHashMap<>();
    for (FieldConfig fieldConfig : fieldConfigMap.values()) {
        for (FacetAs facetAs : fieldConfig.getFacetAsList()) {
            facetAsMap.put(facetAs.getFacetName(), facetAs);
        }
    }

    // index field name -> field type of the owning field config
    this.indexFieldType = new ConcurrentHashMap<>();
    for (FieldConfig fieldConfig : fieldConfigMap.values()) {
        for (IndexAs indexAs : fieldConfig.getIndexAsList()) {
            indexFieldType.put(indexAs.getIndexFieldName(), fieldConfig.getFieldType());
        }
    }

    // sort field name -> field type of the owning field config
    this.sortFieldType = new ConcurrentHashMap<>();
    for (FieldConfig fieldConfig : fieldConfigMap.values()) {
        for (SortAs sortAs : fieldConfig.getSortAsList()) {
            sortFieldType.put(sortAs.getSortFieldName(), fieldConfig.getFieldType());
        }
    }

    // projected field name -> superbit settings, and -> the SuperBit built from them
    this.superbitConfigMap = new ConcurrentHashMap<>();
    this.superbitMap = new ConcurrentHashMap<>();
    for (FieldConfig fieldConfig : fieldConfigMap.values()) {
        for (ProjectAs projectAs : fieldConfig.getProjectAsList()) {
            if (!projectAs.hasSuperbit()) {
                continue;
            }
            String projectedField = projectAs.getField();
            Superbit superbitSettings = projectAs.getSuperbit();
            superbitConfigMap.put(projectedField, superbitSettings);

            // NOTE(review): the input dimension is passed for both of the first two
            // constructor arguments - confirm this is intended against SuperBit's signature.
            int inputDim = superbitSettings.getInputDim();
            SuperBit hasher = new SuperBit(inputDim, superbitSettings.getInputDim(), superbitSettings.getBatches(), superbitSettings.getSeed());
            superbitMap.put(projectedField, hasher);
        }
    }
}
Also used : SortAs(io.zulia.message.ZuliaIndex.SortAs) FacetAs(io.zulia.message.ZuliaIndex.FacetAs) FieldConfig(io.zulia.message.ZuliaIndex.FieldConfig) ProjectAs(io.zulia.message.ZuliaIndex.ProjectAs) IndexAs(io.zulia.message.ZuliaIndex.IndexAs) AnalyzerSettings(io.zulia.message.ZuliaIndex.AnalyzerSettings) Superbit(io.zulia.message.ZuliaIndex.Superbit) SuperBit(info.debatty.java.lsh.SuperBit)

Example 2 with SuperBit

use of info.debatty.java.lsh.SuperBit in project zuliasearch by zuliaio.

the class ZuliaIndex method getCosineSimQuery.

/**
 * Builds a boolean query that compares the SuperBit signature of {@code vector} bit by bit
 * against the per-bit terms of {@code field}.
 *
 * <p>One SHOULD term clause is added per signature bit, on the field
 * {@code SUPERBIT_PREFIX.<field>.<bit index>} with the value {@code "1"} or {@code "0"}.
 * The minimum number of clauses that must match is
 * {@code (1 - acos(similarity) / PI) * signatureLength}, truncated to an int.
 *
 * @param query  the query supplying the required vector similarity
 * @param vector the vector to compute the signature for
 * @param field  the field whose SuperBit instance and per-bit terms are used
 * @return the assembled boolean query
 */
private Query getCosineSimQuery(ZuliaQuery.Query query, double[] vector, String field) {
    // NOTE(review): presumably null when the field has no superbit configured - verify.
    SuperBit hasher = indexConfig.getSuperBitForField(field);
    boolean[] bits = hasher.signature(vector);

    // NOTE(review): Math.acos yields NaN outside [-1, 1], which the int cast turns into 0.
    double requiredFraction = 1 - (Math.acos(query.getVectorSimilarity()) / Math.PI);
    int minimumMatchingBits = (int) (requiredFraction * bits.length);

    BooleanQuery.Builder builder = new BooleanQuery.Builder();
    builder.setMinimumNumberShouldMatch(minimumMatchingBits);

    String bitFieldPrefix = ZuliaConstants.SUPERBIT_PREFIX + "." + field + ".";
    int bitIndex = 0;
    for (boolean bit : bits) {
        String bitValue = bit ? "1" : "0";
        org.apache.lucene.index.Term term = new org.apache.lucene.index.Term(bitFieldPrefix + bitIndex, bitValue);
        builder.add(new BooleanClause(new TermQuery(term), BooleanClause.Occur.SHOULD));
        bitIndex++;
    }
    return builder.build();
}
Also used : BooleanClause(org.apache.lucene.search.BooleanClause) BooleanQuery(org.apache.lucene.search.BooleanQuery) TermQuery(org.apache.lucene.search.TermQuery) SuperBit(info.debatty.java.lsh.SuperBit)

Example 3 with SuperBit

use of info.debatty.java.lsh.SuperBit in project zuliasearch by zuliaio.

the class ShardDocumentIndexer method handleProjectForStoredField.

/**
 * Indexes the SuperBit signature of a stored vector value for every superbit projection
 * configured on the given field config.
 *
 * <p>For each {@code ProjectAs} that has superbit settings, the value is converted to a
 * {@code double[]}, hashed with the SuperBit instance registered for the projected field,
 * and each signature bit is indexed as {@code "1"} or {@code "0"} under the field name
 * {@code SUPERBIT_PREFIX.<field>.<bit index>}. Projections without superbit settings are
 * skipped.
 *
 * @param luceneDocument the document receiving the per-bit fields
 * @param fc             the field config whose projections are processed
 * @param o              the stored value; must be a list of numbers when a superbit
 *                       projection is configured
 * @throws Exception if a superbit projection is configured and {@code o} is not a list,
 *                   or the list contains an element that is not a number
 */
private void handleProjectForStoredField(Document luceneDocument, ZuliaIndex.FieldConfig fc, Object o) throws Exception {
    for (ZuliaIndex.ProjectAs projectAs : fc.getProjectAsList()) {
        if (!projectAs.hasSuperbit()) {
            continue;
        }

        String field = projectAs.getField();
        if (!(o instanceof List)) {
            throw new Exception("Expecting a list for superbit field <" + field + ">");
        }

        // Check each element instead of relying on an unchecked cast to List<Number>, so a
        // bad or null element is reported with the field and position rather than as a
        // bare ClassCastException / NullPointerException.
        List<?> values = (List<?>) o;
        double[] vec = new double[values.size()];
        int i = 0;
        for (Object value : values) {
            if (!(value instanceof Number)) {
                throw new Exception("Expecting numeric values for superbit field <" + field + "> but found <" + value + "> at position " + i);
            }
            vec[i++] = ((Number) value).doubleValue();
        }

        SuperBit superBitForField = indexConfig.getSuperBitForField(field);
        boolean[] signature = superBitForField.signature(vec);

        // The prefix is the same for every bit; only the trailing bit index changes.
        String indexedFieldPrefix = ZuliaConstants.SUPERBIT_PREFIX + "." + field + ".";
        for (int j = 0; j < signature.length; j++) {
            StringFieldIndexer.INSTANCE.index(luceneDocument, field, signature[j] ? "1" : "0", indexedFieldPrefix + j);
        }
    }
}
Also used : List(java.util.List) ZuliaIndex(io.zulia.message.ZuliaIndex) LongPoint(org.apache.lucene.document.LongPoint) SuperBit(info.debatty.java.lsh.SuperBit)

Aggregations

SuperBit (info.debatty.java.lsh.SuperBit): 3, ZuliaIndex (io.zulia.message.ZuliaIndex): 1, AnalyzerSettings (io.zulia.message.ZuliaIndex.AnalyzerSettings): 1, FacetAs (io.zulia.message.ZuliaIndex.FacetAs): 1, FieldConfig (io.zulia.message.ZuliaIndex.FieldConfig): 1, IndexAs (io.zulia.message.ZuliaIndex.IndexAs): 1, ProjectAs (io.zulia.message.ZuliaIndex.ProjectAs): 1, SortAs (io.zulia.message.ZuliaIndex.SortAs): 1, Superbit (io.zulia.message.ZuliaIndex.Superbit): 1, List (java.util.List): 1, LongPoint (org.apache.lucene.document.LongPoint): 1, BooleanClause (org.apache.lucene.search.BooleanClause): 1, BooleanQuery (org.apache.lucene.search.BooleanQuery): 1, TermQuery (org.apache.lucene.search.TermQuery): 1