Search in sources:

Example 1 with DocsAndPositionsEnum

Use of org.apache.lucene.index.DocsAndPositionsEnum in the Krill project by KorAP.

The offsets method of the PositionsToOffset class.

public HashMap<PositionsToOffsetArray, Integer[]> offsets() {
    if (processed)
        return offsets;
    if (DEBUG)
        log.trace("Process offsets");
    StringBuilder sb = new StringBuilder().append('_');
    try {
        Terms terms = atomic.reader().fields().terms(field);
        if (terms != null) {
            // TODO: Maybe reuse a termsEnum!
            final TermsEnum termsEnum = terms.iterator(null);
            for (PositionsToOffsetArray posDoc : positions) {
                if (this.exists(posDoc))
                    continue;
                int docID = posDoc.docID;
                // Build the per-position term "_<pos>" that carries the offset payload
                sb.append(posDoc.pos);
                Term term = new Term(field, sb.toString());
                sb.setLength(1);
                // Seek the iterator to that term
                if (termsEnum.seekExact(term.bytes())) {
                    if (DEBUG)
                        log.trace("Search for {} in doc {} with pos {}", term.toString(), posDoc.docID, posDoc.pos);
                    // Start an iterator to fetch all payloads of the term
                    DocsAndPositionsEnum docs = termsEnum.docsAndPositions(null, null, DocsAndPositionsEnum.FLAG_PAYLOADS);
                    if (docs.advance(docID) == docID) {
                        docs.nextPosition();
                        BytesRef payload = docs.getPayload();
                        if (payload.length == 8) {
                            bbOffset.clear();
                            bbOffset.put(payload.bytes, payload.offset, 8);
                            bbOffset.rewind();
                            Integer[] offsetArray = new Integer[2];
                            offsetArray[0] = bbOffset.getInt();
                            offsetArray[1] = bbOffset.getInt();
                            offsets.put(posDoc, offsetArray);
                            if (DEBUG)
                                log.trace("Found {}-{} for {}", offsetArray[0], offsetArray[1], term.toString());
                        } else {
                            log.error("Doc {} has no offsets stored for {}", docID, term.toString());
                        }
                    }
                }
            }
        }
    } catch (IOException e) {
        log.warn(e.getLocalizedMessage());
    }
    processed = true;
    positions.clear();
    return offsets;
}
Also used : DocsAndPositionsEnum(org.apache.lucene.index.DocsAndPositionsEnum) Terms(org.apache.lucene.index.Terms) Term(org.apache.lucene.index.Term) BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum)
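
For context, here is a minimal standalone sketch of the same payload-decoding pattern, assuming a Lucene 4.x AtomicReader and an 8-byte offset payload as in the Krill example above; the class and method names (PayloadOffsetSketch, firstPayloadOffsets) and all parameters are illustrative and not part of Krill.

import java.io.IOException;
import java.nio.ByteBuffer;

import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

public class PayloadOffsetSketch {

    /**
     * Looks up termText in field and, if it occurs in docId, decodes the first
     * position's 8-byte payload as two ints (start and end character offset).
     * Returns null if the term, the document, or the payload is absent.
     */
    public static int[] firstPayloadOffsets(AtomicReader reader, String field,
                                            String termText, int docId) throws IOException {
        Fields fields = reader.fields();
        if (fields == null) {
            return null;
        }
        Terms terms = fields.terms(field);
        if (terms == null) {
            return null;
        }
        TermsEnum termsEnum = terms.iterator(null);
        if (!termsEnum.seekExact(new BytesRef(termText))) {
            // term is not indexed in this field
            return null;
        }
        // Request payloads so getPayload() is populated
        DocsAndPositionsEnum dpe =
                termsEnum.docsAndPositions(null, null, DocsAndPositionsEnum.FLAG_PAYLOADS);
        if (dpe == null || dpe.advance(docId) != docId) {
            // term does not occur in this document
            return null;
        }
        dpe.nextPosition();
        BytesRef payload = dpe.getPayload();
        if (payload == null || payload.length != 8) {
            // no 8-byte offset payload stored at this position
            return null;
        }
        ByteBuffer bb = ByteBuffer.wrap(payload.bytes, payload.offset, 8);
        return new int[] { bb.getInt(), bb.getInt() };
    }
}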

Example 2 with DocsAndPositionsEnum

Use of org.apache.lucene.index.DocsAndPositionsEnum in the elasticsearch-skywalker project by jprante.

The reconstruct method of the DocumentReconstructor class.

/**
 * Reconstructs the documents of an index shard.
 *
 * @param shardId the id of the shard being reconstructed (recorded in the output)
 * @return a builder holding the reconstructed documents
 * @throws IOException if the index cannot be read
 */
public XContentBuilder reconstruct(int shardId) throws IOException {
    XContentBuilder builder = jsonBuilder();
    builder.startObject().field("shardId", shardId).field("numDeletions", reader.numDeletedDocs());
    builder.startArray("docs");
    FieldInfos fieldInfos = reader.getFieldInfos();
    Bits live = MultiFields.getLiveDocs(reader);
    for (int docNum = 0; docNum < reader.maxDoc(); docNum++) {
        if (live != null && !live.get(docNum)) {
            // deleted document, skip it
            continue;
        }
        Document doc = reader.document(docNum);
        builder.startObject().startArray("fields");
        if (fieldInfos != null) {
            for (FieldInfo fi : fieldInfos) {
                String name = fi.name;
                IndexableField[] fs = doc.getFields(name);
                if (fs != null && fs.length > 0) {
                    for (IndexableField f : fs) {
                        IndexableFieldToXContent x = new IndexableFieldToXContent().field(f);
                        x.toXContent(builder, ToXContent.EMPTY_PARAMS);
                    }
                }
            }
        }
        builder.endArray();
        builder.startArray("terms");
        if (fieldInfos != null) {
            TermsEnum te = null;
            DocsAndPositionsEnum dpe = null;
            for (FieldInfo fi : fieldInfos) {
                Terms terms = MultiFields.getTerms(reader, fi.name);
                if (terms == null) {
                    // no terms in this field
                    continue;
                }
                te = terms.iterator(te);
                while (te.next() != null) {
                    // request offsets so startOffset()/endOffset() below are populated
                    DocsAndPositionsEnum newDpe = te.docsAndPositions(live, dpe, DocsAndPositionsEnum.FLAG_OFFSETS);
                    if (newDpe == null) {
                        // no position info for this field
                        break;
                    }
                    dpe = newDpe;
                    int num = dpe.advance(docNum);
                    if (num != docNum) {
                        // no data for this term in this doc
                        continue;
                    }
                    String text = te.term().utf8ToString();
                    List<Integer> positions = new ArrayList<>();
                    List<Integer> starts = new ArrayList<>();
                    List<Integer> ends = new ArrayList<>();
                    for (int k = 0; k < dpe.freq(); k++) {
                        int pos = dpe.nextPosition();
                        positions.add(pos);
                        starts.add(dpe.startOffset());
                        ends.add(dpe.endOffset());
                    }
                    builder.startObject().field("text", text).field("positions", positions).field("starts", starts).field("ends", ends).field("count", dpe.freq()).endObject();
                }
            }
        }
        builder.endArray();
        builder.endObject();
    }
    builder.endArray();
    builder.endObject();
    return builder;
}
Also used : Terms(org.apache.lucene.index.Terms) ArrayList(java.util.ArrayList) Document(org.apache.lucene.document.Document) TermsEnum(org.apache.lucene.index.TermsEnum) FieldInfos(org.apache.lucene.index.FieldInfos) IndexableField(org.apache.lucene.index.IndexableField) IndexableFieldToXContent(org.xbib.elasticsearch.action.skywalker.support.IndexableFieldToXContent) DocsAndPositionsEnum(org.apache.lucene.index.DocsAndPositionsEnum) Bits(org.apache.lucene.util.Bits) XContentBuilder(org.elasticsearch.common.xcontent.XContentBuilder) FieldInfo(org.apache.lucene.index.FieldInfo)
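
Below is a condensed, hedged variant of the inner term loop above, stripped of the XContentBuilder plumbing; the names TermPositionSketch, TermPositions, and collect are illustrative, not part of elasticsearch-skywalker. Like the fixed example, it requests DocsAndPositionsEnum.FLAG_OFFSETS so that startOffset()/endOffset() are populated when offsets were indexed.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.Bits;

public class TermPositionSketch {

    /** Positions and character offsets of one term within a single document. */
    public static class TermPositions {
        public final String text;
        public final List<Integer> positions = new ArrayList<>();
        public final List<Integer> startOffsets = new ArrayList<>();
        public final List<Integer> endOffsets = new ArrayList<>();

        TermPositions(String text) {
            this.text = text;
        }
    }

    /**
     * Collects, for every term of fieldName that occurs in docNum, its positions
     * and character offsets (offsets are -1 if they were not indexed).
     */
    public static List<TermPositions> collect(IndexReader reader, String fieldName, int docNum)
            throws IOException {
        List<TermPositions> result = new ArrayList<>();
        Bits live = MultiFields.getLiveDocs(reader);
        Terms terms = MultiFields.getTerms(reader, fieldName);
        if (terms == null) {
            // field has no indexed terms
            return result;
        }
        TermsEnum te = terms.iterator(null);
        DocsAndPositionsEnum dpe = null;
        while (te.next() != null) {
            dpe = te.docsAndPositions(live, dpe, DocsAndPositionsEnum.FLAG_OFFSETS);
            if (dpe == null) {
                // field was indexed without positions
                break;
            }
            if (dpe.advance(docNum) != docNum) {
                // this term does not occur in the document
                continue;
            }
            TermPositions tp = new TermPositions(te.term().utf8ToString());
            for (int k = 0; k < dpe.freq(); k++) {
                tp.positions.add(dpe.nextPosition());
                tp.startOffsets.add(dpe.startOffset());
                tp.endOffsets.add(dpe.endOffset());
            }
            result.add(tp);
        }
        return result;
    }
}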

Aggregations

DocsAndPositionsEnum (org.apache.lucene.index.DocsAndPositionsEnum)2 Terms (org.apache.lucene.index.Terms)2 TermsEnum (org.apache.lucene.index.TermsEnum)2 ArrayList (java.util.ArrayList)1 Document (org.apache.lucene.document.Document)1 FieldInfo (org.apache.lucene.index.FieldInfo)1 FieldInfos (org.apache.lucene.index.FieldInfos)1 IndexableField (org.apache.lucene.index.IndexableField)1 Term (org.apache.lucene.index.Term)1 Bits (org.apache.lucene.util.Bits)1 BytesRef (org.apache.lucene.util.BytesRef)1 XContentBuilder (org.elasticsearch.common.xcontent.XContentBuilder)1 IndexableFieldToXContent (org.xbib.elasticsearch.action.skywalker.support.IndexableFieldToXContent)1