Use of org.apache.lucene.index.TermsEnum in project elasticsearch-skywalker by jprante.
The class DocumentReconstructor, method reconstruct:
/**
 * Reconstruct the documents of an index shard from the inverted index.
 *
 * @param shardId the shard ID
 * @return a JSON builder holding the reconstructed documents
 * @throws IOException if reading the index fails
 */
public XContentBuilder reconstruct(int shardId) throws IOException {
    XContentBuilder builder = jsonBuilder();
    builder.startObject().field("shardId", shardId).field("numDeletions", reader.numDeletedDocs());
    builder.startArray("docs");
    FieldInfos fieldInfos = reader.getFieldInfos();
    Bits live = MultiFields.getLiveDocs(reader);
    for (int docNum = 0; docNum < reader.maxDoc(); docNum++) {
        if (live != null && !live.get(docNum)) {
            // deleted document, skip it
            continue;
        }
        Document doc = reader.document(docNum);
        builder.startObject().startArray("fields");
        if (fieldInfos != null) {
            for (FieldInfo fi : fieldInfos) {
                String name = fi.name;
                IndexableField[] fs = doc.getFields(name);
                if (fs != null && fs.length > 0) {
                    for (IndexableField f : fs) {
                        IndexableFieldToXContent x = new IndexableFieldToXContent().field(f);
                        x.toXContent(builder, ToXContent.EMPTY_PARAMS);
                    }
                }
            }
        }
        builder.endArray();
        builder.startArray("terms");
        if (fieldInfos != null) {
            TermsEnum te = null;
            DocsAndPositionsEnum dpe = null;
            for (FieldInfo fi : fieldInfos) {
                Terms terms = MultiFields.getTerms(reader, fi.name);
                if (terms == null) {
                    // no terms in this field
                    continue;
                }
                te = terms.iterator(te);
                while (te.next() != null) {
                    // request offsets so startOffset()/endOffset() below return real values
                    DocsAndPositionsEnum newDpe = te.docsAndPositions(live, dpe, DocsAndPositionsEnum.FLAG_OFFSETS);
                    if (newDpe == null) {
                        // no position info for this field
                        break;
                    }
                    dpe = newDpe;
                    int num = dpe.advance(docNum);
                    if (num != docNum) {
                        // no occurrence of this term in this doc
                        continue;
                    }
                    String text = te.term().utf8ToString();
                    List<Integer> positions = new ArrayList<>();
                    List<Integer> starts = new ArrayList<>();
                    List<Integer> ends = new ArrayList<>();
                    for (int k = 0; k < dpe.freq(); k++) {
                        positions.add(dpe.nextPosition());
                        starts.add(dpe.startOffset());
                        ends.add(dpe.endOffset());
                    }
                    builder.startObject()
                            .field("text", text)
                            .field("positions", positions)
                            .field("starts", starts)
                            .field("ends", ends)
                            .field("count", dpe.freq())
                            .endObject();
                }
            }
        }
        builder.endArray();
        builder.endObject();
    }
    builder.endArray();
    builder.endObject();
    return builder;
}
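For context, a minimal driver for this reconstructor could look as follows. This is a sketch under assumptions: that DocumentReconstructor wraps an IndexReader via a single-argument constructor, and that the shard's Lucene index lives at the given path; neither is confirmed by the snippet above.

import java.io.File;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.store.FSDirectory;

// Hypothetical driver: open a shard's Lucene index (Lucene 4.x-era API) and print the reconstruction.
try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(new File("/path/to/shard/index")))) {
    DocumentReconstructor reconstructor = new DocumentReconstructor(reader); // assumed constructor
    XContentBuilder builder = reconstructor.reconstruct(0); // shard 0
    System.out.println(builder.string()); // string() renders the JSON in the ES 1.x-era XContentBuilder
}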
Use of org.apache.lucene.index.TermsEnum in project languagetool by languagetool-org.
The class HomophoneOccurrenceDumper, method dumpOccurrences:
private void dumpOccurrences(Set<String> tokens) throws IOException {
    Objects.requireNonNull(tokens);
    TermsEnum iterator = getIterator();
    BytesRef byteRef;
    int i = 0;
    while ((byteRef = iterator.next()) != null) {
        // decode the term bytes as UTF-8 (the platform default charset is not safe here)
        String term = byteRef.utf8ToString();
        String[] split = term.split(" ");
        if (split.length == 3) {
            String token = split[1];
            if (tokens.contains(token)) {
                long count = getCount(Arrays.asList(split[0], split[1], split[2]));
                if (count >= MIN_COUNT) {
                    System.out.println(token + "\t" + count + "\t" + split[0] + " " + split[1] + " " + split[2]);
                }
            }
        }
        if (i % 10_000 == 0) {
            System.err.println(i + "...");
        }
        i++;
    }
}
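The getIterator() helper is not shown here. A sketch of how such a TermsEnum is typically obtained from a Lucene reader follows; the reader field and the "ngram" field name are illustrative assumptions, not the project's actual names.

import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;

// Hypothetical counterpart to getIterator(): iterate all terms of one indexed field.
private TermsEnum getIterator() throws IOException {
    Terms terms = MultiFields.getTerms(reader, "ngram"); // field name is an assumption
    if (terms == null) {
        return TermsEnum.EMPTY; // field not present: return the empty enum instead of null
    }
    return terms.iterator();
}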
Use of org.apache.lucene.index.TermsEnum in project languagetool by languagetool-org.
The class HomophoneOccurrenceDumper, method getContext:
/**
 * Get the context (left and right words) for the given word(s). This is slow,
 * as it needs to scan the whole index.
 */
Map<String, Long> getContext(String... tokens) throws IOException {
    Objects.requireNonNull(tokens);
    TermsEnum iterator = getIterator();
    Map<String, Long> result = new HashMap<>();
    BytesRef byteRef;
    int i = 0;
    while ((byteRef = iterator.next()) != null) {
        // decode the term bytes as UTF-8 (the platform default charset is not safe here)
        String term = byteRef.utf8ToString();
        for (String token : tokens) {
            if (term.contains(" " + token + " ")) {
                String[] split = term.split(" ");
                if (split.length == 3) {
                    long count = getCount(Arrays.asList(split[0], split[1], split[2]));
                    result.put(term, count);
                }
            }
        }
        /*if (i++ > 1_000_000) { // comment in for faster testing with subsets of the data
            break;
        }*/
    }
    return result;
}
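A possible call site, assuming a constructed dumper instance (the variable name and the sorting are illustrative only):

// Hypothetical usage: print all 3-gram contexts of "there", most frequent first.
Map<String, Long> context = dumper.getContext("there");
context.entrySet().stream()
        .sorted(Map.Entry.<String, Long>comparingByValue().reversed())
        .forEach(e -> System.out.println(e.getValue() + "\t" + e.getKey()));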
Use of org.apache.lucene.index.TermsEnum in project elasticsearch by elastic.
The class TermVectorsFilter, method selectBestTerms:
public void selectBestTerms() throws IOException {
    PostingsEnum docsEnum = null;
    for (String fieldName : fields) {
        if ((selectedFields != null) && (!selectedFields.contains(fieldName))) {
            continue;
        }
        Terms terms = fields.terms(fieldName);
        Terms topLevelTerms = topLevelFields.terms(fieldName);
        // if no terms found, take the retrieved term vector fields for stats
        if (topLevelTerms == null) {
            topLevelTerms = terms;
        }
        long numDocs = getDocCount(fieldName, topLevelTerms);
        // one queue per field name
        ScoreTermsQueue queue = new ScoreTermsQueue(Math.min(maxNumTerms, (int) terms.size()));
        // select terms with highest tf-idf
        TermsEnum termsEnum = terms.iterator();
        TermsEnum topLevelTermsEnum = topLevelTerms.iterator();
        while (termsEnum.next() != null) {
            BytesRef termBytesRef = termsEnum.term();
            boolean foundTerm = topLevelTermsEnum.seekExact(termBytesRef);
            assert foundTerm : "Term: " + termBytesRef.utf8ToString() + " not found!";
            Term term = new Term(fieldName, termBytesRef);
            // remove noise words
            int freq = getTermFreq(termsEnum, docsEnum);
            if (isNoise(term.bytes().utf8ToString(), freq)) {
                continue;
            }
            // filter on the document frequency
            long docFreq = getTermStatistics(topLevelTermsEnum, term).docFreq();
            if (!isAccepted(docFreq)) {
                continue;
            }
            // filter based on score
            float score = computeScore(docFreq, freq, numDocs);
            queue.addOrUpdate(new ScoreTerm(term.field(), term.bytes().utf8ToString(), score));
        }
        // retain the best terms for quick lookups
        ScoreTerm scoreTerm;
        int count = 0;
        while ((scoreTerm = queue.pop()) != null) {
            scoreTerms.put(new Term(scoreTerm.field, scoreTerm.word), scoreTerm);
            count++;
        }
        sizes.put(fieldName, count);
    }
}
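The getTermFreq(termsEnum, docsEnum) helper is not shown. Since a term vector's postings describe a single pseudo-document, the frequency can be read off the first posting; a minimal sketch of that pattern, not necessarily the exact Elasticsearch implementation:

// Sketch: read a term's within-field frequency from its (single-document) term vector postings.
private int getTermFreq(TermsEnum termsEnum, PostingsEnum docsEnum) throws IOException {
    docsEnum = termsEnum.postings(docsEnum, PostingsEnum.FREQS); // reuse the enum across terms
    docsEnum.nextDoc(); // position on the one and only pseudo-document
    return docsEnum.freq();
}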
Use of org.apache.lucene.index.TermsEnum in project elasticsearch by elastic.
The class TermVectorsWriter, method setFields:
void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags, Fields topLevelFields, @Nullable AggregatedDfs dfs, @Nullable TermVectorsFilter termVectorsFilter) throws IOException {
    int numFieldsWritten = 0;
    PostingsEnum docsAndPosEnum = null;
    PostingsEnum docsEnum = null;
    boolean hasScores = termVectorsFilter != null;
    for (String field : termVectorsByField) {
        if ((selectedFields != null) && (!selectedFields.contains(field))) {
            continue;
        }
        Terms fieldTermVector = termVectorsByField.terms(field);
        Terms topLevelTerms = topLevelFields.terms(field);
        // if no terms found, take the retrieved term vector fields for stats
        if (topLevelTerms == null) {
            topLevelTerms = EMPTY_TERMS;
        }
        TermsEnum topLevelIterator = topLevelTerms.iterator();
        boolean positions = flags.contains(Flag.Positions) && fieldTermVector.hasPositions();
        boolean offsets = flags.contains(Flag.Offsets) && fieldTermVector.hasOffsets();
        boolean payloads = flags.contains(Flag.Payloads) && fieldTermVector.hasPayloads();
        long termsSize = fieldTermVector.size();
        if (hasScores) {
            termsSize = Math.min(termsSize, termVectorsFilter.size(field));
        }
        startField(field, termsSize, positions, offsets, payloads);
        if (flags.contains(Flag.FieldStatistics)) {
            if (dfs != null) {
                writeFieldStatistics(dfs.fieldStatistics().get(field));
            } else {
                writeFieldStatistics(topLevelTerms);
            }
        }
        TermsEnum iterator = fieldTermVector.iterator();
        final boolean useDocsAndPos = positions || offsets || payloads;
        while (iterator.next() != null) {
            // iterate all terms of the current field
            BytesRef termBytesRef = iterator.term();
            Term term = new Term(field, termBytesRef);
            // with filtering we only keep the best terms
            if (hasScores && !termVectorsFilter.hasScoreTerm(term)) {
                continue;
            }
            startTerm(termBytesRef);
            if (flags.contains(Flag.TermStatistics)) {
                // get the doc frequency
                if (dfs != null) {
                    final TermStatistics statistics = dfs.termStatistics().get(term);
                    writeTermStatistics(statistics == null ? new TermStatistics(termBytesRef, 0, 0) : statistics);
                } else {
                    boolean foundTerm = topLevelIterator.seekExact(termBytesRef);
                    if (foundTerm) {
                        writeTermStatistics(topLevelIterator);
                    } else {
                        writeTermStatistics(new TermStatistics(termBytesRef, 0, 0));
                    }
                }
            }
            if (useDocsAndPos) {
                // given we have positions or offsets
                docsAndPosEnum = writeTermWithDocsAndPos(iterator, docsAndPosEnum, positions, offsets, payloads);
            } else {
                // if we do not have the positions stored, we need to
                // get the frequency from a PostingsEnum.
                docsEnum = writeTermWithDocsOnly(iterator, docsEnum);
            }
            if (hasScores) {
                writeScoreTerm(termVectorsFilter.getScoreTerm(term));
            }
        }
        numFieldsWritten++;
    }
    response.setTermVectorsField(output);
    response.setHeader(writeHeader(numFieldsWritten, flags.contains(Flag.TermStatistics), flags.contains(Flag.FieldStatistics), hasScores));
}
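The writeTermWithDocsOnly(iterator, docsEnum) helper referenced above follows the same single-pseudo-document pattern as in the filter. A sketch of what it plausibly does, with writeFreq() standing in for the writer's internal output routine (an assumption here):

// Sketch: a docs-only term writer reads the frequency from the term vector's
// single pseudo-document and emits it; no positions, offsets, or payloads.
private PostingsEnum writeTermWithDocsOnly(TermsEnum iterator, PostingsEnum docsEnum) throws IOException {
    docsEnum = iterator.postings(docsEnum, PostingsEnum.FREQS);
    docsEnum.nextDoc(); // term vectors expose exactly one pseudo-document
    writeFreq(docsEnum.freq()); // writeFreq() is assumed to be the writer's output primitive
    return docsEnum;
}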