Example usage of org.apache.lucene.index.TermsEnum in the Apache lucene-solr project:
the assignNormClasses method of the SimpleNaiveBayesDocumentClassifier class.
/**
 * Calculates normalized class probabilities for the given document, scoring each text
 * field separately and weighting every field's contribution by its boost.
 *
 * @param inputDocument the document to classify
 * @return a {@code List} of {@code ClassificationResult}, one per existing class value
 *         (empty when the class field has no terms), normalized to the 0-1 range
 * @throws IOException if accessing the underlying index fails
 */
private List<ClassificationResult<BytesRef>> assignNormClasses(Document inputDocument) throws IOException {
List<ClassificationResult<BytesRef>> assignedClasses = new ArrayList<>();
Map<String, List<String[]>> fieldName2tokensArray = new LinkedHashMap<>();
Map<String, Float> fieldName2boost = new LinkedHashMap<>();
Terms classes = MultiFields.getTerms(indexReader, classFieldName);
// Guard against an index that has no terms for the class field; previously this
// dereferenced null. Mirrors SimpleNaiveBayesClassifier#assignClassNormalizedList.
if (classes != null) {
  TermsEnum classesEnum = classes.iterator();
  BytesRef c;
  // tokenize the input document per field, collecting per-field boosts as a side effect
  analyzeSeedDocument(inputDocument, fieldName2tokensArray, fieldName2boost);
  int docsWithClassSize = countDocsWithClass();
  while ((c = classesEnum.next()) != null) {
    double classScore = 0;
    Term term = new Term(this.classFieldName, c);
    for (String fieldName : textFieldNames) {
      List<String[]> tokensArrays = fieldName2tokensArray.get(fieldName);
      double fieldScore = 0;
      for (String[] fieldTokensArray : tokensArrays) {
        // NOTE(review): the log prior is added once per token array rather than once per
        // class — preserved as-is; confirm against the intended scoring model.
        fieldScore += calculateLogPrior(term, docsWithClassSize)
            + calculateLogLikelihood(fieldTokensArray, fieldName, term, docsWithClassSize) * fieldName2boost.get(fieldName);
      }
      classScore += fieldScore;
    }
    assignedClasses.add(new ClassificationResult<>(term.bytes(), classScore));
  }
}
return normClassificationResults(assignedClasses);
}
Example usage of org.apache.lucene.index.TermsEnum in the Apache lucene-solr project:
the toSparseLocalFreqDoubleArray method of the DocToDoubleVectorUtils class.
/**
 * Creates a sparse {@code Double} vector for a document: one slot per term of the
 * field's full term dictionary, holding that term's local frequency within the
 * document, or {@code 0d} when the document does not contain the term.
 *
 * @param docTerms term vectors for a given document
 * @param fieldTerms field term vectors
 * @return a sparse vector of {@code Double}s as an array, or {@code null} when
 *         {@code docTerms} is null or the field term count is unknown
 * @throws IOException in case accessing the underlying index fails
 */
public static Double[] toSparseLocalFreqDoubleArray(Terms docTerms, Terms fieldTerms) throws IOException {
  TermsEnum fieldTermsEnum = fieldTerms.iterator();
  if (docTerms == null || fieldTerms.size() <= -1) {
    return null;
  }
  Double[] frequencies = new Double[(int) fieldTerms.size()];
  TermsEnum docTermsEnum = docTerms.iterator();
  int slot = 0;
  for (BytesRef fieldTerm = fieldTermsEnum.next(); fieldTerm != null; fieldTerm = fieldTermsEnum.next()) {
    TermsEnum.SeekStatus status = docTermsEnum.seekCeil(fieldTerm);
    if (TermsEnum.SeekStatus.END.equals(status)) {
      // the doc enum is exhausted; restart it so later field terms can still be sought
      docTermsEnum = docTerms.iterator();
    }
    frequencies[slot++] = TermsEnum.SeekStatus.FOUND.equals(status)
        // total occurrences of this term within the given document
        ? Long.valueOf(docTermsEnum.totalTermFreq()).doubleValue()
        : 0d;
  }
  return frequencies;
}
Example usage of org.apache.lucene.index.TermsEnum in the Apache lucene-solr project:
the assignClassNormalizedList method of the SimpleNaiveBayesClassifier class.
/**
 * Calculate probabilities for all classes for a given input text
 * @param inputDocument the input text as a {@code String}
 * @return a {@code List} of {@code ClassificationResult}, one for each existing class
 * @throws IOException if assigning probabilities fails
 */
protected List<ClassificationResult<BytesRef>> assignClassNormalizedList(String inputDocument) throws IOException {
  List<ClassificationResult<BytesRef>> results = new ArrayList<>();
  Terms classes = MultiFields.getTerms(indexReader, classFieldName);
  if (classes != null) {
    String[] tokenizedText = tokenize(inputDocument);
    int docsWithClassSize = countDocsWithClass();
    TermsEnum classesEnum = classes.iterator();
    for (BytesRef next = classesEnum.next(); next != null; next = classesEnum.next()) {
      if (next.length <= 0) {
        continue; // skip empty class values
      }
      Term term = new Term(this.classFieldName, next);
      double score = calculateLogPrior(term, docsWithClassSize)
          + calculateLogLikelihood(tokenizedText, term, docsWithClassSize);
      results.add(new ClassificationResult<>(term.bytes(), score));
    }
  }
  // normalization; the values transforms to a 0-1 range
  return normClassificationResults(results);
}
Example usage of org.apache.lucene.index.TermsEnum in the Apache lucene-solr project:
the updateWeights method of the BooleanPerceptronClassifier class.
/**
 * Updates the per-term perceptron weights using the term vector of a single document.
 *
 * @param indexReader the reader to fetch the document's term vector from
 * @param docId the id of the document whose terms drive the update
 * @param assignedClass the class assigned to the document; when {@code null}, terms are
 *                      iterated (and sought in {@code cte}) but no weights are changed
 * @param weights the mutable map of term string to weight that gets updated in place
 * @param modifier the signed per-occurrence adjustment applied to each term's weight
 * @param updateFST whether to rebuild the backing FST from {@code weights} afterwards
 * @throws IOException if term vectors are missing for the text field or index access fails
 */
private void updateWeights(IndexReader indexReader, int docId, Boolean assignedClass, SortedMap<String, Double> weights, double modifier, boolean updateFST) throws IOException {
TermsEnum cte = textTerms.iterator();
// get the doc term vectors
Terms terms = indexReader.getTermVector(docId, textFieldName);
if (terms == null) {
throw new IOException("term vectors must be stored for field " + textFieldName);
}
TermsEnum termsEnum = terms.iterator();
BytesRef term;
while ((term = termsEnum.next()) != null) {
// NOTE(review): seekExact's boolean result is ignored and cte is never read afterwards;
// presumably the seek positions the field-level enum intentionally — confirm before removing.
cte.seekExact(term);
if (assignedClass != null) {
long termFreqLocal = termsEnum.totalTermFreq();
// update weights
// previous weight for this term, looked up in the FST (null if absent)
Long previousValue = Util.get(fst, term);
String termString = term.utf8ToString();
// NOTE(review): an unseen term gets weight 0 regardless of modifier, and weights are
// clamped to be non-negative — verify this matches the intended perceptron update rule
weights.put(termString, previousValue == null ? 0 : Math.max(0, previousValue + modifier * termFreqLocal));
}
}
if (updateFST) {
// rebuild the FST so subsequent lookups observe the new weights
updateFST(weights);
}
}
Example usage of org.apache.lucene.index.TermsEnum in the Apache lucene-solr project:
the assignClassNormalizedList method of the BM25NBClassifier class.
/**
 * Calculate probabilities for all classes for a given input text
 *
 * @param inputDocument the input text as a {@code String}
 * @return a {@code List} of {@code ClassificationResult}, one for each existing class
 *         (empty when the class field has no terms), normalized to the 0-1 range
 * @throws IOException if assigning probabilities fails
 */
private List<ClassificationResult<BytesRef>> assignClassNormalizedList(String inputDocument) throws IOException {
List<ClassificationResult<BytesRef>> assignedClasses = new ArrayList<>();
Terms classes = MultiFields.getTerms(indexReader, classFieldName);
// Guard against an index that has no terms for the class field; previously this
// dereferenced null. Mirrors SimpleNaiveBayesClassifier#assignClassNormalizedList.
if (classes != null) {
  TermsEnum classesEnum = classes.iterator();
  BytesRef next;
  String[] tokenizedText = tokenize(inputDocument);
  while ((next = classesEnum.next()) != null) {
    if (next.length > 0) { // skip empty class values
      Term term = new Term(this.classFieldName, next);
      assignedClasses.add(new ClassificationResult<>(term.bytes(), calculateLogPrior(term) + calculateLogLikelihood(tokenizedText, term)));
    }
  }
}
return normClassificationResults(assignedClasses);
}
Aggregations