Search in sources :

Example 51 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.

the class SimpleNaiveBayesDocumentClassifier method assignNormClasses.

/**
 * Scores the given document against every class term indexed in the class field.
 * For each class, the per-field scores (log prior plus boosted log likelihood of each
 * token array of that field) are summed, and the collected results are then passed
 * through {@code normClassificationResults}.
 *
 * @param inputDocument the document to classify
 * @return one {@code ClassificationResult} per class term, or an empty list when the
 *         class field has no terms in the index
 * @throws IOException if reading from the underlying index fails
 */
private List<ClassificationResult<BytesRef>> assignNormClasses(Document inputDocument) throws IOException {
    List<ClassificationResult<BytesRef>> assignedClasses = new ArrayList<>();
    Terms classes = MultiFields.getTerms(indexReader, classFieldName);
    if (classes == null) {
        // getTerms returns null when the class field is missing or has no terms;
        // there is nothing to score, so avoid the NullPointerException on iterator()
        return assignedClasses;
    }
    Map<String, List<String[]>> fieldName2tokensArray = new LinkedHashMap<>();
    Map<String, Float> fieldName2boost = new LinkedHashMap<>();
    TermsEnum classesEnum = classes.iterator();
    BytesRef c;
    // fills both maps from the input document (tokens and boost per field name)
    analyzeSeedDocument(inputDocument, fieldName2tokensArray, fieldName2boost);
    int docsWithClassSize = countDocsWithClass();
    while ((c = classesEnum.next()) != null) {
        double classScore = 0;
        Term term = new Term(this.classFieldName, c);
        for (String fieldName : textFieldNames) {
            List<String[]> tokensArrays = fieldName2tokensArray.get(fieldName);
            double fieldScore = 0;
            for (String[] fieldTokensArray : tokensArrays) {
                // only the likelihood part is scaled by the field boost
                fieldScore += calculateLogPrior(term, docsWithClassSize) + calculateLogLikelihood(fieldTokensArray, fieldName, term, docsWithClassSize) * fieldName2boost.get(fieldName);
            }
            classScore += fieldScore;
        }
        assignedClasses.add(new ClassificationResult<>(term.bytes(), classScore));
    }
    return normClassificationResults(assignedClasses);
}
Also used : ArrayList(java.util.ArrayList) Terms(org.apache.lucene.index.Terms) ClassificationResult(org.apache.lucene.classification.ClassificationResult) Term(org.apache.lucene.index.Term) LinkedHashMap(java.util.LinkedHashMap) TermsEnum(org.apache.lucene.index.TermsEnum) ArrayList(java.util.ArrayList) LinkedList(java.util.LinkedList) List(java.util.List) BytesRef(org.apache.lucene.util.BytesRef)

Example 52 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.

the class DocToDoubleVectorUtils method toSparseLocalFreqDoubleArray.

/**
   * Builds a sparse <code>Double</code> vector with one slot per term of the field, holding the
   * local frequency of that term in the document (or zero when the document lacks the term).
   *
   * @param docTerms   term vectors for a given document
   * @param fieldTerms field term vectors
   * @return the frequency vector as an array, or <code>null</code> when <code>docTerms</code> is
   *         <code>null</code> or the field's term count is reported as negative
   * @throws IOException in case accessing the underlying index fails
   */
public static Double[] toSparseLocalFreqDoubleArray(Terms docTerms, Terms fieldTerms) throws IOException {
    TermsEnum fieldIterator = fieldTerms.iterator();
    if (docTerms == null || fieldTerms.size() <= -1) {
        return null;
    }
    Double[] vector = new Double[(int) fieldTerms.size()];
    TermsEnum docIterator = docTerms.iterator();
    int slot = 0;
    for (BytesRef fieldTerm = fieldIterator.next(); fieldTerm != null; fieldTerm = fieldIterator.next()) {
        switch (docIterator.seekCeil(fieldTerm)) {
            case FOUND:
                // total number of occurrences of this term in the given document
                vector[slot] = (double) docIterator.totalTermFreq();
                break;
            case END:
                // the seek ran off the end: start over with a fresh enum for the next term
                docIterator = docTerms.iterator();
                vector[slot] = 0d;
                break;
            default:
                vector[slot] = 0d;
                break;
        }
        slot++;
    }
    return vector;
}
Also used : BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 53 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.

the class SimpleNaiveBayesClassifier method assignClassNormalizedList.

/**
   * Scores the input text against every non-empty class term of the class field and hands the
   * collected scores to {@code normClassificationResults}.
   * @param inputDocument the input text as a {@code String}
   * @return a {@code List} of {@code ClassificationResult}, one for each existing class
   * @throws IOException if assigning probabilities fails
   */
protected List<ClassificationResult<BytesRef>> assignClassNormalizedList(String inputDocument) throws IOException {
    List<ClassificationResult<BytesRef>> scored = new ArrayList<>();
    Terms classTerms = MultiFields.getTerms(indexReader, classFieldName);
    if (classTerms != null) {
        TermsEnum classIterator = classTerms.iterator();
        String[] tokens = tokenize(inputDocument);
        int classDocCount = countDocsWithClass();
        for (BytesRef classBytes = classIterator.next(); classBytes != null; classBytes = classIterator.next()) {
            if (classBytes.length <= 0) {
                // empty class terms are not scored
                continue;
            }
            Term classTerm = new Term(this.classFieldName, classBytes);
            double score = calculateLogPrior(classTerm, classDocCount) + calculateLogLikelihood(tokens, classTerm, classDocCount);
            scored.add(new ClassificationResult<>(classTerm.bytes(), score));
        }
    }
    // the collected scores are normalized before being returned
    return normClassificationResults(scored);
}
Also used : ArrayList(java.util.ArrayList) Terms(org.apache.lucene.index.Terms) Term(org.apache.lucene.index.Term) BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 54 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.

the class BooleanPerceptronClassifier method updateWeights.

/**
 * Walks the term vector of the given document and, when a class was assigned, recomputes the
 * weight of each of its terms in {@code weights}; optionally rebuilds the FST afterwards.
 *
 * @param indexReader   reader used to fetch the document's term vector
 * @param docId         id of the document whose terms are processed
 * @param assignedClass class assigned to the document; when {@code null} no weight is touched
 * @param weights       term to weight map that receives the new values
 * @param modifier      factor multiplied by the local term frequency and added to the stored weight
 * @param updateFST     whether {@code updateFST(weights)} is invoked at the end
 * @throws IOException if the field has no stored term vectors or the index cannot be read
 */
private void updateWeights(IndexReader indexReader, int docId, Boolean assignedClass, SortedMap<String, Double> weights, double modifier, boolean updateFST) throws IOException {
    TermsEnum textTermsEnum = textTerms.iterator();
    // term vector of the document for the text field
    Terms docVector = indexReader.getTermVector(docId, textFieldName);
    if (docVector == null) {
        throw new IOException("term vectors must be stored for field " + textFieldName);
    }
    TermsEnum docTermsEnum = docVector.iterator();
    for (BytesRef current = docTermsEnum.next(); current != null; current = docTermsEnum.next()) {
        // the outcome of this seek is not inspected
        textTermsEnum.seekExact(current);
        if (assignedClass == null) {
            continue;
        }
        long localFreq = docTermsEnum.totalTermFreq();
        Long storedWeight = Util.get(fst, current);
        String key = current.utf8ToString();
        double newWeight;
        if (storedWeight == null) {
            // terms without a stored weight are set to zero
            newWeight = 0;
        } else {
            // weights are never allowed to drop below zero
            newWeight = Math.max(0, storedWeight + modifier * localFreq);
        }
        weights.put(key, newWeight);
    }
    if (updateFST) {
        updateFST(weights);
    }
}
Also used : Terms(org.apache.lucene.index.Terms) IOException(java.io.IOException) BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 55 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.

the class BM25NBClassifier method assignClassNormalizedList.

/**
   * Calculate probabilities for all classes for a given input text. Each non-empty class term is
   * scored as log prior plus log likelihood, and the scores are then passed through
   * {@code normClassificationResults}.
   *
   * @param inputDocument the input text as a {@code String}
   * @return a {@code List} of {@code ClassificationResult}, one for each existing class; empty when
   *         the class field has no terms in the index
   * @throws IOException if assigning probabilities fails
   */
private List<ClassificationResult<BytesRef>> assignClassNormalizedList(String inputDocument) throws IOException {
    List<ClassificationResult<BytesRef>> assignedClasses = new ArrayList<>();
    Terms classes = MultiFields.getTerms(indexReader, classFieldName);
    if (classes == null) {
        // getTerms returns null when the class field is missing or has no terms;
        // return the empty result instead of failing with a NullPointerException
        return assignedClasses;
    }
    TermsEnum classesEnum = classes.iterator();
    BytesRef next;
    String[] tokenizedText = tokenize(inputDocument);
    while ((next = classesEnum.next()) != null) {
        // empty class terms are skipped
        if (next.length > 0) {
            Term term = new Term(this.classFieldName, next);
            assignedClasses.add(new ClassificationResult<>(term.bytes(), calculateLogPrior(term) + calculateLogLikelihood(tokenizedText, term)));
        }
    }
    return normClassificationResults(assignedClasses);
}
Also used : ArrayList(java.util.ArrayList) Terms(org.apache.lucene.index.Terms) Term(org.apache.lucene.index.Term) BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum)

Aggregations

TermsEnum (org.apache.lucene.index.TermsEnum)155 BytesRef (org.apache.lucene.util.BytesRef)116 Terms (org.apache.lucene.index.Terms)103 PostingsEnum (org.apache.lucene.index.PostingsEnum)52 ArrayList (java.util.ArrayList)31 Term (org.apache.lucene.index.Term)31 IndexReader (org.apache.lucene.index.IndexReader)29 LeafReader (org.apache.lucene.index.LeafReader)28 IOException (java.io.IOException)26 Fields (org.apache.lucene.index.Fields)26 LeafReaderContext (org.apache.lucene.index.LeafReaderContext)25 Document (org.apache.lucene.document.Document)24 Directory (org.apache.lucene.store.Directory)24 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)19 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)18 HashMap (java.util.HashMap)12 HashSet (java.util.HashSet)11 DirectoryReader (org.apache.lucene.index.DirectoryReader)11 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)10 Bits (org.apache.lucene.util.Bits)10