Example usage of org.apache.lucene.index.DocsAndPositionsEnum in the Krill project (by KorAP): the offsets() method of the PositionsToOffset class.
/**
 * Resolve character offsets for all queued position/document pairs
 * and cache the results.
 *
 * Results are memoized: once processed, the cached map is returned
 * directly and the pending position queue is cleared.
 *
 * @return Mapping from position/document pairs to two-element
 *         {startOffset, endOffset} arrays.
 */
public HashMap<PositionsToOffsetArray, Integer[]> offsets() {
    // Already resolved — return the cached mapping.
    if (processed)
        return offsets;

    if (DEBUG)
        log.trace("Process offsets");

    // Position terms are stored with a '_' prefix, e.g. "_17".
    StringBuilder sb = new StringBuilder().append('_');

    try {
        Terms terms = atomic.reader().fields().terms(field);
        if (terms != null) {

            // TODO: Maybe reuse a termsEnum!
            final TermsEnum termsEnum = terms.iterator(null);

            for (PositionsToOffsetArray posDoc : positions) {
                // Skip entries that were already resolved earlier.
                if (this.exists(posDoc))
                    continue;

                int docID = posDoc.docID;

                // Build the position term "_<pos>", then reset the
                // builder back to the bare "_" prefix for reuse.
                sb.append(posDoc.pos);
                Term term = new Term(field, sb.toString());
                sb.setLength(1);

                // Position the iterator on the seeked term.
                if (termsEnum.seekExact(term.bytes())) {
                    if (DEBUG)
                        log.trace("Search for {} in doc {} with pos {}",
                                  term.toString(), posDoc.docID, posDoc.pos);

                    // Iterate over the postings to fetch the payload.
                    DocsAndPositionsEnum docs = termsEnum.docsAndPositions(
                            null, null, DocsAndPositionsEnum.FLAG_PAYLOADS);

                    // docsAndPositions() may return null when no
                    // positions were indexed for this term — guard
                    // against an NPE on advance().
                    if (docs != null && docs.advance(docID) == docID) {
                        docs.nextPosition();
                        BytesRef payload = docs.getPayload();

                        // The payload encodes two ints (8 bytes):
                        // start offset followed by end offset.
                        if (payload != null && payload.length == 8) {
                            bbOffset.clear();
                            bbOffset.put(payload.bytes, payload.offset, 8);
                            bbOffset.rewind();
                            Integer[] offsetArray = new Integer[2];
                            offsetArray[0] = bbOffset.getInt();
                            offsetArray[1] = bbOffset.getInt();
                            offsets.put(posDoc, offsetArray);

                            if (DEBUG)
                                log.trace("Found {}-{} for {}",
                                          offsetArray[0], offsetArray[1],
                                          term.toString());
                        }
                        else {
                            log.error("Doc {} has no offsets stored for {}",
                                      docID, term.toString());
                        }
                    }
                }
            }
        }
    }
    catch (IOException e) {
        // Pass the exception itself so the stack trace is preserved
        // (getLocalizedMessage() alone loses the cause).
        log.warn("Unable to retrieve offsets", e);
    }

    processed = true;
    positions.clear();
    return offsets;
}
Example usage of org.apache.lucene.index.DocsAndPositionsEnum in the elasticsearch-skywalker project (by jprante): the reconstruct() method of the DocumentReconstructor class.
/**
* Reconstruct an index shard
*
* @return reconstructed document
* @throws Exception
*/
/**
 * Reconstruct the documents of an index shard as JSON, including each
 * document's stored fields and, per indexed term, its positions and
 * start/end character offsets.
 *
 * @param shardId the shard identifier to report in the output
 * @return a builder holding the reconstructed shard as JSON
 * @throws IOException if reading from the index fails
 */
public XContentBuilder reconstruct(int shardId) throws IOException {
    XContentBuilder builder = jsonBuilder();
    builder.startObject()
            .field("shardId", shardId)
            .field("numDeletions", reader.numDeletedDocs());
    builder.startArray("docs");
    FieldInfos fieldInfos = reader.getFieldInfos();
    Bits live = MultiFields.getLiveDocs(reader);
    for (int docNum = 0; docNum < reader.maxDoc(); docNum++) {
        Document doc = reader.document(docNum);
        // NOTE(review): live.get(docNum) == true means the doc is
        // live, so this skips live docs and only emits deleted ones.
        // If the intent is to reconstruct live documents, the
        // condition should be negated — confirm against callers.
        if (live != null && live.get(docNum)) {
            // not deleted
            continue;
        }
        builder.startObject().startArray("fields");
        if (fieldInfos != null) {
            // Emit every stored field of this document.
            for (FieldInfo fi : fieldInfos) {
                String name = fi.name;
                IndexableField[] fs = doc.getFields(name);
                if (fs != null && fs.length > 0) {
                    for (IndexableField f : fs) {
                        IndexableFieldToXContent x =
                                new IndexableFieldToXContent().field(f);
                        x.toXContent(builder, ToXContent.EMPTY_PARAMS);
                    }
                }
            }
        }
        builder.endArray();
        builder.startArray("terms");
        if (fieldInfos != null) {
            TermsEnum te = null;
            DocsAndPositionsEnum dpe = null;
            for (FieldInfo fi : fieldInfos) {
                Terms terms = MultiFields.getTerms(reader, fi.name);
                if (terms == null) {
                    // no terms in this field
                    continue;
                }
                // Reuse the enums across fields/terms where possible.
                te = terms.iterator(te);
                while (te.next() != null) {
                    DocsAndPositionsEnum newDpe =
                            te.docsAndPositions(live, dpe, 0);
                    if (newDpe == null) {
                        // no position info for this field
                        break;
                    }
                    dpe = newDpe;
                    int num = dpe.advance(docNum);
                    if (num != docNum) {
                        // no data for this term in this doc
                        continue;
                    }
                    String text = te.term().utf8ToString();
                    // Hoist freq(): it is constant for this posting
                    // and was previously re-read every iteration.
                    int freq = dpe.freq();
                    List<Integer> positions = new ArrayList<>(freq);
                    List<Integer> starts = new ArrayList<>(freq);
                    List<Integer> ends = new ArrayList<>(freq);
                    for (int k = 0; k < freq; k++) {
                        int pos = dpe.nextPosition();
                        positions.add(pos);
                        starts.add(dpe.startOffset());
                        ends.add(dpe.endOffset());
                    }
                    builder.startObject()
                            .field("text", text)
                            .field("positions", positions)
                            .field("starts", starts)
                            .field("ends", ends)
                            .field("count", freq)
                            .endObject();
                }
            }
        }
        builder.endArray();
        builder.endObject();
    }
    builder.endArray();
    builder.endObject();
    return builder;
}
Aggregations