Search in sources :

Example 91 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project elasticsearch-skywalker by jprante.

In the class DocumentReconstructor, the method reconstruct:

/**
     * Reconstruct the documents of an index shard from stored fields and the
     * inverted index (terms with positions and offsets).
     *
     * @param shardId the identifier of the shard being reconstructed (echoed into the output)
     * @return a JSON builder containing the reconstructed documents
     * @throws IOException if the index cannot be read
     */
public XContentBuilder reconstruct(int shardId) throws IOException {
    XContentBuilder builder = jsonBuilder();
    builder.startObject().field("shardId", shardId).field("numDeletions", reader.numDeletedDocs());
    builder.startArray("docs");
    FieldInfos fieldInfos = reader.getFieldInfos();
    // Live-docs bits: a SET bit means the document is live (not deleted); null means no deletions.
    Bits live = MultiFields.getLiveDocs(reader);
    for (int docNum = 0; docNum < reader.maxDoc(); docNum++) {
        Document doc = reader.document(docNum);
        if (live != null && !live.get(docNum)) {
            // deleted document, skip it.
            // BUGFIX: the original check was `live.get(docNum)`, which skipped every LIVE
            // doc instead (live-docs bits are set for live documents). Since `live` is
            // also passed to docsAndPositions() below, deleted docs could never match
            // advance(docNum), so the original produced no term output at all.
            continue;
        }
        builder.startObject().startArray("fields");
        if (fieldInfos != null) {
            // emit the stored fields of the document
            for (FieldInfo fi : fieldInfos) {
                String name = fi.name;
                IndexableField[] fs = doc.getFields(name);
                if (fs != null && fs.length > 0) {
                    for (IndexableField f : fs) {
                        IndexableFieldToXContent x = new IndexableFieldToXContent().field(f);
                        x.toXContent(builder, ToXContent.EMPTY_PARAMS);
                    }
                }
            }
        }
        builder.endArray();
        builder.startArray("terms");
        if (fieldInfos != null) {
            TermsEnum te = null;
            DocsAndPositionsEnum dpe = null;
            for (FieldInfo fi : fieldInfos) {
                Terms terms = MultiFields.getTerms(reader, fi.name);
                if (terms == null) {
                    // no terms in this field
                    continue;
                }
                // reuse the enum across fields where the codec allows it
                te = terms.iterator(te);
                while (te.next() != null) {
                    // FLAG_OFFSETS is required: with flags == 0 the enum is not obliged to
                    // fill in offsets, and startOffset()/endOffset() may return -1.
                    DocsAndPositionsEnum newDpe = te.docsAndPositions(live, dpe, DocsAndPositionsEnum.FLAG_OFFSETS);
                    if (newDpe == null) {
                        // no position info for this field
                        break;
                    }
                    dpe = newDpe;
                    int num = dpe.advance(docNum);
                    if (num != docNum) {
                        // no data for this term in this doc (also covers NO_MORE_DOCS)
                        continue;
                    }
                    String text = te.term().utf8ToString();
                    List<Integer> positions = new ArrayList<>();
                    List<Integer> starts = new ArrayList<>();
                    List<Integer> ends = new ArrayList<>();
                    for (int k = 0; k < dpe.freq(); k++) {
                        int pos = dpe.nextPosition();
                        positions.add(pos);
                        starts.add(dpe.startOffset());
                        ends.add(dpe.endOffset());
                    }
                    builder.startObject().field("text", text).field("positions", positions).field("starts", starts).field("ends", ends).field("count", dpe.freq()).endObject();
                }
            }
        }
        builder.endArray();
        builder.endObject();
    }
    builder.endArray();
    builder.endObject();
    return builder;
}
Also used : Terms(org.apache.lucene.index.Terms) ArrayList(java.util.ArrayList) Document(org.apache.lucene.document.Document) TermsEnum(org.apache.lucene.index.TermsEnum) FieldInfos(org.apache.lucene.index.FieldInfos) IndexableField(org.apache.lucene.index.IndexableField) IndexableFieldToXContent(org.xbib.elasticsearch.action.skywalker.support.IndexableFieldToXContent) DocsAndPositionsEnum(org.apache.lucene.index.DocsAndPositionsEnum) Bits(org.apache.lucene.util.Bits) XContentBuilder(org.elasticsearch.common.xcontent.XContentBuilder) FieldInfo(org.apache.lucene.index.FieldInfo)

Example 92 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project languagetool by languagetool-org.

In the class HomophoneOccurrenceDumper, the method dumpOccurrences:

/**
 * Print token, occurrence count, and 3-gram context for every indexed 3-gram whose
 * middle word is one of the given tokens. Progress is reported to stderr every
 * 10,000 terms.
 *
 * @param tokens the (homophone) tokens to look up; must not be null
 * @throws IOException if the index cannot be read
 */
private void dumpOccurrences(Set<String> tokens) throws IOException {
    Objects.requireNonNull(tokens);
    TermsEnum iterator = getIterator();
    BytesRef byteRef;
    int i = 0;
    while ((byteRef = iterator.next()) != null) {
        // Term bytes are UTF-8: utf8ToString() decodes them correctly on every platform,
        // whereas new String(bytes, off, len) used the platform default charset.
        String term = byteRef.utf8ToString();
        String[] split = term.split(" ");
        if (split.length == 3) {
            // the index stores 3-grams; split[1] is the middle word
            String token = split[1];
            if (tokens.contains(token)) {
                long count = getCount(Arrays.asList(split));
                if (count >= MIN_COUNT) {
                    System.out.println(token + "\t" + count + "\t" + split[0] + " " + split[1] + " " + split[2]);
                }
            }
        }
        if (i % 10_000 == 0) {
            System.err.println(i + "...");
        }
        i++;
    }
}
Also used : BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 93 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project languagetool by languagetool-org.

In the class HomophoneOccurrenceDumper, the method getContext:

/**
   * Get the context (left and right words) for the given word(s). This is slow,
   * as it needs to scan the whole index.
   *
   * @param tokens the words to collect 3-gram contexts for; must not be null
   * @return a map from 3-gram term to its occurrence count
   * @throws IOException if the index cannot be read
   */
Map<String, Long> getContext(String... tokens) throws IOException {
    Objects.requireNonNull(tokens);
    TermsEnum iterator = getIterator();
    Map<String, Long> result = new HashMap<>();
    BytesRef byteRef;
    while ((byteRef = iterator.next()) != null) {
        // Term bytes are UTF-8: decode explicitly instead of relying on the
        // platform default charset (the original used new String(bytes, off, len)).
        String term = byteRef.utf8ToString();
        for (String token : tokens) {
            if (term.contains(" " + token + " ")) {
                String[] split = term.split(" ");
                if (split.length == 3) {
                    long count = getCount(Arrays.asList(split));
                    result.put(term, count);
                }
                // a term matching several tokens would produce the identical entry again —
                // stop after the first match to avoid recomputing the count
                break;
            }
        }
    }
    return result;
}
Also used : BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 94 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project elasticsearch by elastic.

In the class TermVectorsFilter, the method selectBestTerms:

/**
 * Select, per field, the terms with the highest tf-idf scores and retain them in
 * {@code scoreTerms} (keyed by {@code Term}) for quick lookup; the number of terms
 * kept per field is recorded in {@code sizes}.
 *
 * @throws IOException if the term vectors or the top-level index cannot be read
 */
public void selectBestTerms() throws IOException {
    // Reuse candidate for postings. NOTE(review): it is passed to getTermFreq() but never
    // reassigned here — presumably getTermFreq() only reads/reuses it internally; confirm.
    PostingsEnum docsEnum = null;
    for (String fieldName : fields) {
        // honor the caller-supplied field filter, if any
        if ((selectedFields != null) && (!selectedFields.contains(fieldName))) {
            continue;
        }
        Terms terms = fields.terms(fieldName);
        Terms topLevelTerms = topLevelFields.terms(fieldName);
        // if no terms found, take the retrieved term vector fields for stats
        if (topLevelTerms == null) {
            topLevelTerms = terms;
        }
        long numDocs = getDocCount(fieldName, topLevelTerms);
        // one queue per field name
        // NOTE(review): Terms.size() may return -1 when unknown; for a single document's
        // term vector it is known — confirm `terms` always originates from a term vector.
        ScoreTermsQueue queue = new ScoreTermsQueue(Math.min(maxNumTerms, (int) terms.size()));
        // select terms with highest tf-idf
        TermsEnum termsEnum = terms.iterator();
        TermsEnum topLevelTermsEnum = topLevelTerms.iterator();
        while (termsEnum.next() != null) {
            BytesRef termBytesRef = termsEnum.term();
            // every term-vector term must exist in the top-level terms dictionary
            boolean foundTerm = topLevelTermsEnum.seekExact(termBytesRef);
            assert foundTerm : "Term: " + termBytesRef.utf8ToString() + " not found!";
            Term term = new Term(fieldName, termBytesRef);
            // remove noise words
            int freq = getTermFreq(termsEnum, docsEnum);
            if (isNoise(term.bytes().utf8ToString(), freq)) {
                continue;
            }
            // now call on docFreq
            long docFreq = getTermStatistics(topLevelTermsEnum, term).docFreq();
            if (!isAccepted(docFreq)) {
                continue;
            }
            // filter based on score
            float score = computeScore(docFreq, freq, numDocs);
            queue.addOrUpdate(new ScoreTerm(term.field(), term.bytes().utf8ToString(), score));
        }
        // retain the best terms for quick lookups
        ScoreTerm scoreTerm;
        int count = 0;
        while ((scoreTerm = queue.pop()) != null) {
            scoreTerms.put(new Term(scoreTerm.field, scoreTerm.word), scoreTerm);
            count++;
        }
        sizes.put(fieldName, count);
    }
}
Also used : Terms(org.apache.lucene.index.Terms) Term(org.apache.lucene.index.Term) PostingsEnum(org.apache.lucene.index.PostingsEnum) BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 95 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project elasticsearch by elastic.

In the class TermVectorsWriter, the method setFields:

/**
 * Serialize the given per-document term vectors into the response output, honoring the
 * requested flags (positions, offsets, payloads, term/field statistics) and the optional
 * filter that restricts output to the highest-scoring terms.
 *
 * @param termVectorsByField the term vectors of the document, by field
 * @param selectedFields     if non-null, only these fields are written
 * @param flags              which statistics and postings data to include
 * @param topLevelFields     index-level fields used for statistics lookups
 * @param dfs                if non-null, pre-aggregated distributed frequencies take
 *                           precedence over the local top-level statistics
 * @param termVectorsFilter  if non-null, only terms scored by this filter are written
 * @throws IOException if reading the term vectors or writing the output fails
 */
void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags, Fields topLevelFields, @Nullable AggregatedDfs dfs, @Nullable TermVectorsFilter termVectorsFilter) throws IOException {
    int numFieldsWritten = 0;
    // reuse holders for postings enums across terms/fields
    PostingsEnum docsAndPosEnum = null;
    PostingsEnum docsEnum = null;
    boolean hasScores = termVectorsFilter != null;
    for (String field : termVectorsByField) {
        // honor the caller-supplied field filter, if any
        if ((selectedFields != null) && (!selectedFields.contains(field))) {
            continue;
        }
        Terms fieldTermVector = termVectorsByField.terms(field);
        Terms topLevelTerms = topLevelFields.terms(field);
        // if no terms found, take the retrieved term vector fields for stats
        if (topLevelTerms == null) {
            topLevelTerms = EMPTY_TERMS;
        }
        TermsEnum topLevelIterator = topLevelTerms.iterator();
        // only emit data the term vector actually stores, even if the flag requests it
        boolean positions = flags.contains(Flag.Positions) && fieldTermVector.hasPositions();
        boolean offsets = flags.contains(Flag.Offsets) && fieldTermVector.hasOffsets();
        boolean payloads = flags.contains(Flag.Payloads) && fieldTermVector.hasPayloads();
        long termsSize = fieldTermVector.size();
        if (hasScores) {
            // filtering keeps at most the filter's per-field term budget
            termsSize = Math.min(termsSize, termVectorsFilter.size(field));
        }
        startField(field, termsSize, positions, offsets, payloads);
        if (flags.contains(Flag.FieldStatistics)) {
            if (dfs != null) {
                // distributed stats take precedence when available
                writeFieldStatistics(dfs.fieldStatistics().get(field));
            } else {
                writeFieldStatistics(topLevelTerms);
            }
        }
        TermsEnum iterator = fieldTermVector.iterator();
        final boolean useDocsAndPos = positions || offsets || payloads;
        while (iterator.next() != null) {
            // iterate all terms of the current field
            BytesRef termBytesRef = iterator.term();
            Term term = new Term(field, termBytesRef);
            // with filtering we only keep the best terms
            if (hasScores && !termVectorsFilter.hasScoreTerm(term)) {
                continue;
            }
            startTerm(termBytesRef);
            if (flags.contains(Flag.TermStatistics)) {
                // get the doc frequency
                if (dfs != null) {
                    final TermStatistics statistics = dfs.termStatistics().get(term);
                    // unknown term in the aggregated stats: write zeroed statistics
                    writeTermStatistics(statistics == null ? new TermStatistics(termBytesRef, 0, 0) : statistics);
                } else {
                    boolean foundTerm = topLevelIterator.seekExact(termBytesRef);
                    if (foundTerm) {
                        writeTermStatistics(topLevelIterator);
                    } else {
                        // term no longer present at top level: write zeroed statistics
                        writeTermStatistics(new TermStatistics(termBytesRef, 0, 0));
                    }
                }
            }
            if (useDocsAndPos) {
                // given we have pos or offsets
                docsAndPosEnum = writeTermWithDocsAndPos(iterator, docsAndPosEnum, positions, offsets, payloads);
            } else {
                // if we do not have the positions stored, we need to
                // get the frequency from a PostingsEnum.
                docsEnum = writeTermWithDocsOnly(iterator, docsEnum);
            }
            if (hasScores) {
                writeScoreTerm(termVectorsFilter.getScoreTerm(term));
            }
        }
        numFieldsWritten++;
    }
    response.setTermVectorsField(output);
    response.setHeader(writeHeader(numFieldsWritten, flags.contains(Flag.TermStatistics), flags.contains(Flag.FieldStatistics), hasScores));
}
Also used : Terms(org.apache.lucene.index.Terms) Term(org.apache.lucene.index.Term) PostingsEnum(org.apache.lucene.index.PostingsEnum) TermStatistics(org.apache.lucene.search.TermStatistics) BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum)

Aggregations

TermsEnum (org.apache.lucene.index.TermsEnum)155 BytesRef (org.apache.lucene.util.BytesRef)116 Terms (org.apache.lucene.index.Terms)103 PostingsEnum (org.apache.lucene.index.PostingsEnum)52 ArrayList (java.util.ArrayList)31 Term (org.apache.lucene.index.Term)31 IndexReader (org.apache.lucene.index.IndexReader)29 LeafReader (org.apache.lucene.index.LeafReader)28 IOException (java.io.IOException)26 Fields (org.apache.lucene.index.Fields)26 LeafReaderContext (org.apache.lucene.index.LeafReaderContext)25 Document (org.apache.lucene.document.Document)24 Directory (org.apache.lucene.store.Directory)24 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)19 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)18 HashMap (java.util.HashMap)12 HashSet (java.util.HashSet)11 DirectoryReader (org.apache.lucene.index.DirectoryReader)11 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)10 Bits (org.apache.lucene.util.Bits)10