
Example 6 with ByteArrayDataInput

Use of org.apache.lucene.store.ByteArrayDataInput in project lucene-solr by apache.

From the class IDVersionSegmentTermsEnum, method printSeekState:

@SuppressWarnings("unused")
private void printSeekState(PrintStream out) throws IOException {
    if (currentFrame == staticFrame) {
        out.println("  no prior seek");
    } else {
        out.println("  prior seek state:");
        int ord = 0;
        boolean isSeekFrame = true;
        while (true) {
            IDVersionSegmentTermsEnumFrame f = getFrame(ord);
            assert f != null;
            final BytesRef prefix = new BytesRef(term.bytes(), 0, f.prefix);
            if (f.nextEnt == -1) {
                out.println("    frame " + (isSeekFrame ? "(seek)" : "(next)") + " ord=" + ord + " fp=" + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" + f.prefix + " prefix=" + brToString(prefix) + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " code=" + ((f.fp << VersionBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS) + (f.hasTerms ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS : 0) + (f.isFloor ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR : 0)) + " isLastInFloor=" + f.isLastInFloor + " mdUpto=" + f.metaDataUpto + " tbOrd=" + f.getTermBlockOrd());
            } else {
                out.println("    frame " + (isSeekFrame ? "(seek, loaded)" : "(next, loaded)") + " ord=" + ord + " fp=" + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" + f.prefix + " prefix=" + brToString(prefix) + " nextEnt=" + f.nextEnt + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " code=" + ((f.fp << VersionBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS) + (f.hasTerms ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS : 0) + (f.isFloor ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR : 0)) + " lastSubFP=" + f.lastSubFP + " isLastInFloor=" + f.isLastInFloor + " mdUpto=" + f.metaDataUpto + " tbOrd=" + f.getTermBlockOrd());
            }
            if (fr.index != null) {
                assert !isSeekFrame || f.arc != null : "isSeekFrame=" + isSeekFrame + " f.arc=" + f.arc;
                if (f.prefix > 0 && isSeekFrame && f.arc.label != (term.byteAt(f.prefix - 1) & 0xFF)) {
                    out.println("      broken seek state: arc.label=" + (char) f.arc.label + " vs term byte=" + (char) (term.byteAt(f.prefix - 1) & 0xFF));
                    throw new RuntimeException("seek state is broken");
                }
                Pair<BytesRef, Long> output = Util.get(fr.index, prefix);
                if (output == null) {
                    out.println("      broken seek state: prefix is not final in index");
                    throw new RuntimeException("seek state is broken");
                } else if (isSeekFrame && !f.isFloor) {
                    final ByteArrayDataInput reader = new ByteArrayDataInput(output.output1.bytes, output.output1.offset, output.output1.length);
                    final long codeOrig = reader.readVLong();
                    final long code = (f.fp << VersionBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS) | (f.hasTerms ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS : 0) | (f.isFloor ? VersionBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR : 0);
                    if (codeOrig != code) {
                        out.println("      broken seek state: output code=" + codeOrig + " doesn't match frame code=" + code);
                        throw new RuntimeException("seek state is broken");
                    }
                }
            }
            if (f == currentFrame) {
                break;
            }
            if (f.prefix == validIndexPrefix) {
                isSeekFrame = false;
            }
            ord++;
        }
    }
}
Also used: ByteArrayDataInput (org.apache.lucene.store.ByteArrayDataInput), BytesRef (org.apache.lucene.util.BytesRef)
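
The verification above leans on one ByteArrayDataInput idiom: an FST output is just a byte[] blob, so the frame code is recovered by wrapping those bytes and reading a vLong. Below is a minimal, self-contained sketch of that round trip; the flag constants are assumptions that mirror the VersionBlockTreeTermsWriter names used above, and the sample file pointer is invented.

// A minimal sketch (not part of the Lucene source above) of the round trip the
// verification relies on: a frame code packed into a vLong, written to a
// byte[], and read back with ByteArrayDataInput.
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;

public class FrameCodeRoundTrip {
    // Assumed flag layout mirroring VersionBlockTreeTermsWriter: the file
    // pointer is shifted left past the flag bits; hasTerms/isFloor sit low.
    static final int OUTPUT_FLAGS_NUM_BITS = 2;
    static final int OUTPUT_FLAG_HAS_TERMS = 2;
    static final int OUTPUT_FLAG_IS_FLOOR = 1;

    public static void main(String[] args) throws Exception {
        long fp = 12345; // invented file pointer
        long code = (fp << OUTPUT_FLAGS_NUM_BITS) | OUTPUT_FLAG_HAS_TERMS;

        byte[] buffer = new byte[10]; // a vLong takes at most 9 bytes
        ByteArrayDataOutput out = new ByteArrayDataOutput(buffer);
        out.writeVLong(code);

        ByteArrayDataInput reader = new ByteArrayDataInput(buffer, 0, out.getPosition());
        long decoded = reader.readVLong();
        System.out.println("fp=" + (decoded >>> OUTPUT_FLAGS_NUM_BITS)
            + " hasTerms=" + ((decoded & OUTPUT_FLAG_HAS_TERMS) != 0)
            + " isFloor=" + ((decoded & OUTPUT_FLAG_IS_FLOOR) != 0));
    }
}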

Example 7 with ByteArrayDataInput

Use of org.apache.lucene.store.ByteArrayDataInput in project lucene-solr by apache.

From the class IDVersionSegmentTermsEnumFrame, method loadBlock:

/* Does initial decode of next block of terms; this
     doesn't actually decode the docFreq, totalTermFreq,
     postings details (frq/prx offset, etc.) metadata;
     it just loads them as byte[] blobs which are then      
     decoded on-demand if the metadata is ever requested
     for any term in this block.  This enables terms-only
     intensive consumes (eg certain MTQs, respelling) to
     not pay the price of decoding metadata they won't
     use. */
void loadBlock() throws IOException {
    // Clone the IndexInput lazily, so that consumers
    // that just pull a TermsEnum to
    // seekExact(TermState) don't pay this cost:
    ste.initIndexInput();
    if (nextEnt != -1) {
        // Already loaded
        return;
    }
    //System.out.println("blc=" + blockLoadCount);
    ste.in.seek(fp);
    int code = ste.in.readVInt();
    entCount = code >>> 1;
    assert entCount > 0;
    isLastInFloor = (code & 1) != 0;
    assert arc == null || (isLastInFloor || isFloor);
    // TODO: if suffixes were stored in random-access
    // array structure, then we could do binary search
    // instead of linear scan to find target term; eg
    // we could have simple array of offsets
    // term suffixes:
    code = ste.in.readVInt();
    isLeafBlock = (code & 1) != 0;
    int numBytes = code >>> 1;
    if (suffixBytes.length < numBytes) {
        suffixBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
    }
    ste.in.readBytes(suffixBytes, 0, numBytes);
    suffixesReader.reset(suffixBytes, 0, numBytes);
    /*if (DEBUG) {
      if (arc == null) {
      System.out.println("    loadBlock (next) fp=" + fp + " entCount=" + entCount + " prefixLen=" + prefix + " isLastInFloor=" + isLastInFloor + " leaf?=" + isLeafBlock);
      } else {
      System.out.println("    loadBlock (seek) fp=" + fp + " entCount=" + entCount + " prefixLen=" + prefix + " hasTerms?=" + hasTerms + " isFloor?=" + isFloor + " isLastInFloor=" + isLastInFloor + " leaf?=" + isLeafBlock);
      }
      }*/
    metaDataUpto = 0;
    state.termBlockOrd = 0;
    nextEnt = 0;
    lastSubFP = -1;
    // TODO: we could skip this if !hasTerms; but
    // that's rare so won't help much
    // metadata
    numBytes = ste.in.readVInt();
    if (bytes == null) {
        bytes = new byte[ArrayUtil.oversize(numBytes, 1)];
        bytesReader = new ByteArrayDataInput();
    } else if (bytes.length < numBytes) {
        bytes = new byte[ArrayUtil.oversize(numBytes, 1)];
    }
    ste.in.readBytes(bytes, 0, numBytes);
    bytesReader.reset(bytes, 0, numBytes);
    // Sub-blocks of a single floor block are always
    // written one after another -- tail recurse:
    fpEnd = ste.in.getFilePointer();
// if (DEBUG) {
//   System.out.println("      fpEnd=" + fpEnd);
// }
}
Also used: ByteArrayDataInput (org.apache.lucene.store.ByteArrayDataInput)
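
loadBlock shows the allocation-free reuse idiom behind most of these examples: grow a scratch byte[] with ArrayUtil.oversize only when the incoming block is larger, then reset() the same ByteArrayDataInput over it instead of allocating a reader per block. Below is a minimal sketch of just that pattern; the class and method names (ReusableReader, load) are invented for illustration.

// A sketch of the grow-and-reset reuse pattern; assumes lucene-core on the classpath.
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.util.ArrayUtil;

public class ReusableReader {
    private byte[] scratch = new byte[16];
    private final ByteArrayDataInput reader = new ByteArrayDataInput();

    // Copies an encoded block into the scratch buffer, reallocating only
    // when the block outgrows it, then points the reusable reader at it.
    void load(byte[] encoded, int numBytes) {
        if (scratch.length < numBytes) {
            scratch = new byte[ArrayUtil.oversize(numBytes, Byte.BYTES)];
        }
        System.arraycopy(encoded, 0, scratch, 0, numBytes);
        reader.reset(scratch, 0, numBytes);
    }

    public static void main(String[] args) {
        ReusableReader r = new ReusableReader();
        byte[] block = {5, 1, 2, 3};
        r.load(block, block.length);
        System.out.println("first vInt=" + r.reader.readVInt()); // prints 5
    }
}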

Example 8 with ByteArrayDataInput

Use of org.apache.lucene.store.ByteArrayDataInput in project lucene-solr by apache.

From the class AnalyzingSuggester, method build:

@Override
public void build(InputIterator iterator) throws IOException {
    if (iterator.hasContexts()) {
        throw new IllegalArgumentException("this suggester doesn't support contexts");
    }
    hasPayloads = iterator.hasPayloads();
    OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix, new AnalyzingComparator(hasPayloads));
    IndexOutput tempInput = tempDir.createTempOutput(tempFileNamePrefix, "input", IOContext.DEFAULT);
    OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    OfflineSorter.ByteSequencesReader reader = null;
    BytesRefBuilder scratch = new BytesRefBuilder();
    TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
    String tempSortedFileName = null;
    count = 0;
    byte[] buffer = new byte[8];
    try {
        ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
        for (BytesRef surfaceForm; (surfaceForm = iterator.next()) != null; ) {
            LimitedFiniteStringsIterator finiteStrings = new LimitedFiniteStringsIterator(toAutomaton(surfaceForm, ts2a), maxGraphExpansions);
            for (IntsRef string; (string = finiteStrings.next()) != null; count++) {
                Util.toBytesRef(string, scratch);
                // length of the analyzed text (FST input)
                if (scratch.length() > Short.MAX_VALUE - 2) {
                    throw new IllegalArgumentException("cannot handle analyzed forms > " + (Short.MAX_VALUE - 2) + " in length (got " + scratch.length() + ")");
                }
                short analyzedLength = (short) scratch.length();
                // compute the required length:
                // analyzed sequence + weight (4) + surface + analyzedLength (short)
                int requiredLength = analyzedLength + 4 + surfaceForm.length + 2;
                BytesRef payload;
                if (hasPayloads) {
                    if (surfaceForm.length > (Short.MAX_VALUE - 2)) {
                        throw new IllegalArgumentException("cannot handle surface form > " + (Short.MAX_VALUE - 2) + " in length (got " + surfaceForm.length + ")");
                    }
                    payload = iterator.payload();
                    // payload + surfaceLength (short)
                    requiredLength += payload.length + 2;
                } else {
                    payload = null;
                }
                buffer = ArrayUtil.grow(buffer, requiredLength);
                output.reset(buffer);
                output.writeShort(analyzedLength);
                output.writeBytes(scratch.bytes(), 0, scratch.length());
                output.writeInt(encodeWeight(iterator.weight()));
                if (hasPayloads) {
                    for (int i = 0; i < surfaceForm.length; i++) {
                        if (surfaceForm.bytes[i] == PAYLOAD_SEP) {
                            throw new IllegalArgumentException("surface form cannot contain unit separator character U+001F; this character is reserved");
                        }
                    }
                    output.writeShort((short) surfaceForm.length);
                    output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
                    output.writeBytes(payload.bytes, payload.offset, payload.length);
                } else {
                    output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
                }
                assert output.getPosition() == requiredLength : output.getPosition() + " vs " + requiredLength;
                writer.write(buffer, 0, output.getPosition());
            }
            maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, finiteStrings.size());
        }
        CodecUtil.writeFooter(tempInput);
        writer.close();
        // Sort all input/output pairs (required by FST.Builder):
        tempSortedFileName = sorter.sort(tempInput.getName());
        // Free disk space:
        tempDir.deleteFile(tempInput.getName());
        reader = new OfflineSorter.ByteSequencesReader(tempDir.openChecksumInput(tempSortedFileName, IOContext.READONCE), tempSortedFileName);
        PairOutputs<Long, BytesRef> outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
        Builder<Pair<Long, BytesRef>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
        // Build FST:
        BytesRefBuilder previousAnalyzed = null;
        BytesRefBuilder analyzed = new BytesRefBuilder();
        BytesRef surface = new BytesRef();
        IntsRefBuilder scratchInts = new IntsRefBuilder();
        ByteArrayDataInput input = new ByteArrayDataInput();
        // Used to remove duplicate surface forms (but we
// still index the highest-weight one).  We clear
        // this when we see a new analyzed form, so it cannot
        // grow unbounded (at most 256 entries):
        Set<BytesRef> seenSurfaceForms = new HashSet<>();
        int dedup = 0;
        while (true) {
            BytesRef bytes = reader.next();
            if (bytes == null) {
                break;
            }
            input.reset(bytes.bytes, bytes.offset, bytes.length);
            short analyzedLength = input.readShort();
            analyzed.grow(analyzedLength + 2);
            input.readBytes(analyzed.bytes(), 0, analyzedLength);
            analyzed.setLength(analyzedLength);
            long cost = input.readInt();
            surface.bytes = bytes.bytes;
            if (hasPayloads) {
                surface.length = input.readShort();
                surface.offset = input.getPosition();
            } else {
                surface.offset = input.getPosition();
                surface.length = bytes.length - surface.offset;
            }
            if (previousAnalyzed == null) {
                previousAnalyzed = new BytesRefBuilder();
                previousAnalyzed.copyBytes(analyzed.get());
                seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
            } else if (analyzed.get().equals(previousAnalyzed.get())) {
                dedup++;
                if (dedup >= maxSurfaceFormsPerAnalyzedForm) {
                    // dups: skip the rest:
                    continue;
                }
                if (seenSurfaceForms.contains(surface)) {
                    continue;
                }
                seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
            } else {
                dedup = 0;
                previousAnalyzed.copyBytes(analyzed);
                seenSurfaceForms.clear();
                seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
            }
            // TODO: I think we can avoid the extra 2 bytes when
            // there is no dup (dedup==0), but we'd have to fix
            // the exactFirst logic ... which would be sort of
            // hairy because we'd need to special case the two
            // (dup/not dup)...
            // NOTE: must be byte 0 so we sort before whatever
            // is next
            analyzed.append((byte) 0);
            analyzed.append((byte) dedup);
            Util.toIntsRef(analyzed.get(), scratchInts);
            //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
            if (!hasPayloads) {
                builder.add(scratchInts.get(), outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
            } else {
                int payloadOffset = input.getPosition() + surface.length;
                int payloadLength = bytes.length - payloadOffset;
                BytesRef br = new BytesRef(surface.length + 1 + payloadLength);
                System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length);
                br.bytes[surface.length] = PAYLOAD_SEP;
                System.arraycopy(bytes.bytes, payloadOffset, br.bytes, surface.length + 1, payloadLength);
                br.length = br.bytes.length;
                builder.add(scratchInts.get(), outputs.newPair(cost, br));
            }
        }
        fst = builder.finish();
    //Util.dotToFile(fst, "/tmp/suggest.dot");
    } finally {
        IOUtils.closeWhileHandlingException(reader, writer);
        IOUtils.deleteFilesIgnoringExceptions(tempDir, tempInput.getName(), tempSortedFileName);
    }
}
Also used: OfflineSorter (org.apache.lucene.util.OfflineSorter), BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder), CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder), Builder (org.apache.lucene.util.fst.Builder), IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder), PairOutputs (org.apache.lucene.util.fst.PairOutputs), IntsRef (org.apache.lucene.util.IntsRef), BytesRef (org.apache.lucene.util.BytesRef), Pair (org.apache.lucene.util.fst.PairOutputs.Pair), HashSet (java.util.HashSet), IndexOutput (org.apache.lucene.store.IndexOutput), LimitedFiniteStringsIterator (org.apache.lucene.util.automaton.LimitedFiniteStringsIterator), ByteArrayDataInput (org.apache.lucene.store.ByteArrayDataInput), ByteArrayDataOutput (org.apache.lucene.store.ByteArrayDataOutput), TokenStreamToAutomaton (org.apache.lucene.analysis.TokenStreamToAutomaton)
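
build() writes each entry as analyzedLength (short) + analyzed bytes + weight (int) + surface bytes, then re-reads the same layout with a ByteArrayDataInput after the offline sort. Here is a self-contained sketch of that record round trip; the sample strings are invented, and the weight is written as a plain int (the real code passes it through encodeWeight first).

// Round-trips one suggester record through ByteArrayDataOutput/ByteArrayDataInput.
import java.nio.charset.StandardCharsets;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.util.ArrayUtil;

public class SuggestRecordRoundTrip {
    public static void main(String[] args) throws Exception {
        byte[] analyzed = "quick fox".getBytes(StandardCharsets.UTF_8);
        byte[] surface = "The Quick Fox".getBytes(StandardCharsets.UTF_8);

        // analyzedLength (short) + analyzed + weight (int) + surface:
        int required = 2 + analyzed.length + 4 + surface.length;
        byte[] buffer = ArrayUtil.grow(new byte[8], required);
        ByteArrayDataOutput out = new ByteArrayDataOutput(buffer);
        out.writeShort((short) analyzed.length);
        out.writeBytes(analyzed, 0, analyzed.length);
        out.writeInt(42); // the weight
        out.writeBytes(surface, 0, surface.length);

        // Reading mirrors the loop that runs after the offline sort:
        ByteArrayDataInput in = new ByteArrayDataInput(buffer, 0, out.getPosition());
        short analyzedLength = in.readShort();
        byte[] analyzedOut = new byte[analyzedLength];
        in.readBytes(analyzedOut, 0, analyzedLength);
        int cost = in.readInt();
        int surfaceStart = in.getPosition();
        String surfaceOut = new String(buffer, surfaceStart,
            out.getPosition() - surfaceStart, StandardCharsets.UTF_8);
        System.out.println(new String(analyzedOut, StandardCharsets.UTF_8)
            + " -> " + surfaceOut + " (weight=" + cost + ")");
    }
}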

Example 9 with ByteArrayDataInput

Use of org.apache.lucene.store.ByteArrayDataInput in project lucene-solr by apache.

From the class NRTSuggester, method lookup:

/**
   * Collects at most {@link TopSuggestDocsCollector#getCountToCollect()} completions that
   * match the provided {@link CompletionScorer}.
   * <p>
   * The {@link CompletionScorer#automaton} is intersected with the {@link #fst}.
   * {@link CompletionScorer#weight} is used to compute boosts and/or extract context
   * for each matched partial path. A top N search is executed on {@link #fst} seeded with
   * the matched partial paths. Upon reaching a completed path, {@link CompletionScorer#accept(int, Bits)}
   * and {@link CompletionScorer#score(float, float)} are used on the document id, index weight
   * and query boost to filter and score the entry, before it is collected via
   * {@link TopSuggestDocsCollector#collect(int, CharSequence, CharSequence, float)}.
   */
public void lookup(final CompletionScorer scorer, Bits acceptDocs, final TopSuggestDocsCollector collector) throws IOException {
    final double liveDocsRatio = calculateLiveDocRatio(scorer.reader.numDocs(), scorer.reader.maxDoc());
    if (liveDocsRatio == -1) {
        return;
    }
    final List<FSTUtil.Path<Pair<Long, BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(scorer.automaton, fst);
    // The topN is increased by a factor of # of intersected path
    // to ensure search admissibility. For example, one suggestion can
    // have multiple contexts, resulting in num_context paths for the
    // suggestion instead of 1 in the FST. When queried for the suggestion,
    // the topN value ensures that all paths to the suggestion are evaluated
    // (in case of a match all context query).
    // Note that collectors will early terminate as soon as enough suggestions
    // have been collected, regardless of the set topN value. This value is the
    // maximum number of suggestions that can be collected.
    final int topN = collector.getCountToCollect() * prefixPaths.size();
    final int queueSize = getMaxTopNSearcherQueueSize(topN, scorer.reader.numDocs(), liveDocsRatio, scorer.filtered);
    final CharsRefBuilder spare = new CharsRefBuilder();
    Comparator<Pair<Long, BytesRef>> comparator = getComparator();
    Util.TopNSearcher<Pair<Long, BytesRef>> searcher = new Util.TopNSearcher<Pair<Long, BytesRef>>(fst, topN, queueSize, comparator, new ScoringPathComparator(scorer)) {

        private final ByteArrayDataInput scratchInput = new ByteArrayDataInput();

        @Override
        protected boolean acceptPartialPath(Util.FSTPath<Pair<Long, BytesRef>> path) {
            if (collector.doSkipDuplicates()) {
                // We are removing dups
                if (path.payload == -1) {
                    // This path didn't yet see the complete surface form; let's see if it just did with the arc output we just added:
                    BytesRef arcOutput = path.arc.output.output2;
                    BytesRef output = path.output.output2;
                    for (int i = 0; i < arcOutput.length; i++) {
                        if (arcOutput.bytes[arcOutput.offset + i] == payloadSep) {
                            // OK this arc that the path was just extended by contains the payloadSep, so we now have a full surface form in this path
                            path.payload = output.length - arcOutput.length + i;
                            assert output.bytes[output.offset + path.payload] == payloadSep;
                            break;
                        }
                    }
                }
                if (path.payload != -1) {
                    BytesRef output = path.output.output2;
                    spare.copyUTF8Bytes(output.bytes, output.offset, path.payload);
                    if (collector.seenSurfaceForms.contains(spare.chars(), 0, spare.length())) {
                        return false;
                    }
                }
            }
            return true;
        }

        @Override
        protected boolean acceptResult(Util.FSTPath<Pair<Long, BytesRef>> path) {
            BytesRef output = path.output.output2;
            int payloadSepIndex;
            if (path.payload != -1) {
                payloadSepIndex = path.payload;
                spare.copyUTF8Bytes(output.bytes, output.offset, payloadSepIndex);
            } else {
                assert collector.doSkipDuplicates() == false;
                payloadSepIndex = parseSurfaceForm(output, payloadSep, spare);
            }
            scratchInput.reset(output.bytes, output.offset + payloadSepIndex + 1, output.length - payloadSepIndex - 1);
            int docID = scratchInput.readVInt();
            if (!scorer.accept(docID, acceptDocs)) {
                return false;
            }
            if (collector.doSkipDuplicates()) {
                // now record that we've seen this surface form:
                char[] key = new char[spare.length()];
                System.arraycopy(spare.chars(), 0, key, 0, spare.length());
                if (collector.seenSurfaceForms.contains(key)) {
                    // we already collected a higher scoring document with this key, in this segment:
                    return false;
                }
                collector.seenSurfaceForms.add(key);
            }
            try {
                float score = scorer.score(decode(path.output.output1), path.boost);
                collector.collect(docID, spare.toCharsRef(), path.context, score);
                return true;
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }
    };
    for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
        scorer.weight.setNextMatch(path.input.get());
        BytesRef output = path.output.output2;
        int payload = -1;
        if (collector.doSkipDuplicates()) {
            for (int j = 0; j < output.length; j++) {
                if (output.bytes[output.offset + j] == payloadSep) {
                    // Important to cache this, else we have a possibly O(N^2) cost where N is the length of suggestions
                    payload = j;
                    break;
                }
            }
        }
        searcher.addStartPaths(path.fstNode, path.output, false, path.input, scorer.weight.boost(), scorer.weight.context(), payload);
    }
    // hits are also returned by search()
    // we do not use it, instead collect at acceptResult
    searcher.search();
// search admissibility is not guaranteed
// see comment on getMaxTopNSearcherQueueSize
// assert  search.isComplete;
}
Also used: FSTUtil (org.apache.lucene.search.suggest.analyzing.FSTUtil), Util (org.apache.lucene.util.fst.Util), IOException (java.io.IOException), ByteArrayDataInput (org.apache.lucene.store.ByteArrayDataInput), CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder), BytesRef (org.apache.lucene.util.BytesRef), Pair (org.apache.lucene.util.fst.PairOutputs.Pair)
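
acceptResult never copies the FST output: it resets one reusable ByteArrayDataInput over the slice that follows the payload separator and reads the docID as a vInt. The sketch below isolates that slicing trick; the separator value (0x1F) and the sample surface form are assumptions for illustration, since the real constant lives in the suggester classes.

// Locates a separator byte inside a BytesRef, then reads the vInt docID
// stored after it without copying any bytes.
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.util.BytesRef;

public class DocIdTail {
    static final byte PAYLOAD_SEP = 0x1F; // assumed separator value

    public static void main(String[] args) throws Exception {
        // Build output = surface form + separator + vInt docID:
        byte[] buf = new byte[32];
        ByteArrayDataOutput out = new ByteArrayDataOutput(buf);
        out.writeBytes(new byte[] {'f', 'o', 'o'}, 0, 3);
        out.writeByte(PAYLOAD_SEP);
        out.writeVInt(7); // the docID
        BytesRef output = new BytesRef(buf, 0, out.getPosition());

        // Find the separator, then read only the bytes after it:
        int sepIndex = -1;
        for (int i = 0; i < output.length; i++) {
            if (output.bytes[output.offset + i] == PAYLOAD_SEP) {
                sepIndex = i;
                break;
            }
        }
        ByteArrayDataInput scratchInput = new ByteArrayDataInput();
        scratchInput.reset(output.bytes, output.offset + sepIndex + 1,
            output.length - sepIndex - 1);
        System.out.println("docID=" + scratchInput.readVInt()); // prints 7
    }
}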

Example 10 with ByteArrayDataInput

Use of org.apache.lucene.store.ByteArrayDataInput in project lucene-solr by apache.

From the class FSTCompletionLookup, method build:

@Override
public void build(InputIterator iterator) throws IOException {
    if (iterator.hasPayloads()) {
        throw new IllegalArgumentException("this suggester doesn't support payloads");
    }
    if (iterator.hasContexts()) {
        throw new IllegalArgumentException("this suggester doesn't support contexts");
    }
    OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix);
    ExternalRefSorter externalSorter = new ExternalRefSorter(sorter);
    IndexOutput tempInput = tempDir.createTempOutput(tempFileNamePrefix, "input", IOContext.DEFAULT);
    String tempSortedFileName = null;
    OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    OfflineSorter.ByteSequencesReader reader = null;
    // Push floats up front before sequences to sort them. For now, assume they are non-negative.
    // If negative floats are allowed some trickery needs to be done to find their byte order.
    count = 0;
    try {
        byte[] buffer = new byte[0];
        ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
        BytesRef spare;
        int inputLineCount = 0;
        while ((spare = iterator.next()) != null) {
            if (spare.length + 4 >= buffer.length) {
                buffer = ArrayUtil.grow(buffer, spare.length + 4);
            }
            output.reset(buffer);
            output.writeInt(encodeWeight(iterator.weight()));
            output.writeBytes(spare.bytes, spare.offset, spare.length);
            writer.write(buffer, 0, output.getPosition());
            inputLineCount++;
        }
        CodecUtil.writeFooter(tempInput);
        writer.close();
        // We don't know the distribution of scores and we need to bucket them, so we'll sort
        // and divide into equal buckets.
        tempSortedFileName = sorter.sort(tempInput.getName());
        tempDir.deleteFile(tempInput.getName());
        FSTCompletionBuilder builder = new FSTCompletionBuilder(buckets, externalSorter, sharedTailLength);
        reader = new OfflineSorter.ByteSequencesReader(tempDir.openChecksumInput(tempSortedFileName, IOContext.READONCE), tempSortedFileName);
        long line = 0;
        int previousBucket = 0;
        int previousScore = 0;
        ByteArrayDataInput input = new ByteArrayDataInput();
        BytesRef tmp2 = new BytesRef();
        while (true) {
            BytesRef scratch = reader.next();
            if (scratch == null) {
                break;
            }
            input.reset(scratch.bytes, scratch.offset, scratch.length);
            int currentScore = input.readInt();
            int bucket;
            if (line > 0 && currentScore == previousScore) {
                bucket = previousBucket;
            } else {
                bucket = (int) (line * buckets / inputLineCount);
            }
            previousScore = currentScore;
            previousBucket = bucket;
            // Only append the input, discard the weight.
            tmp2.bytes = scratch.bytes;
            tmp2.offset = scratch.offset + input.getPosition();
            tmp2.length = scratch.length - input.getPosition();
            builder.add(tmp2, bucket);
            line++;
            count++;
        }
        // The two FSTCompletions share the same automaton.
        this.higherWeightsCompletion = builder.build();
        this.normalCompletion = new FSTCompletion(higherWeightsCompletion.getFST(), false, exactMatchFirst);
    } finally {
        IOUtils.closeWhileHandlingException(reader, writer, externalSorter);
        IOUtils.deleteFilesIgnoringExceptions(tempDir, tempInput.getName(), tempSortedFileName);
    }
}
Also used: OfflineSorter (org.apache.lucene.util.OfflineSorter), IndexOutput (org.apache.lucene.store.IndexOutput), ByteArrayDataInput (org.apache.lucene.store.ByteArrayDataInput), ByteArrayDataOutput (org.apache.lucene.store.ByteArrayDataOutput), BytesRef (org.apache.lucene.util.BytesRef)
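
The bucketing loop depends on readInt() consuming exactly the 4-byte weight prefix, so that getPosition() marks where the raw suggestion bytes begin; those bytes are then referenced in place through a BytesRef rather than copied. A short sketch of that slice-after-read pattern, with invented sample data:

// Writes weight + text, then re-reads the weight and slices the text in place.
import java.nio.charset.StandardCharsets;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.util.BytesRef;

public class WeightPrefixSlice {
    public static void main(String[] args) throws Exception {
        byte[] text = "lucene".getBytes(StandardCharsets.UTF_8);
        byte[] buffer = new byte[4 + text.length];
        ByteArrayDataOutput out = new ByteArrayDataOutput(buffer);
        out.writeInt(100); // weight first, so the offline sort orders by it
        out.writeBytes(text, 0, text.length);

        ByteArrayDataInput input = new ByteArrayDataInput();
        input.reset(buffer, 0, out.getPosition());
        int score = input.readInt();

        // Only the input survives; the weight prefix is discarded:
        BytesRef tmp2 = new BytesRef();
        tmp2.bytes = buffer;
        tmp2.offset = input.getPosition();
        tmp2.length = out.getPosition() - input.getPosition();
        System.out.println(score + " -> " + tmp2.utf8ToString()); // 100 -> lucene
    }
}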

Aggregations

ByteArrayDataInput (org.apache.lucene.store.ByteArrayDataInput): 26
BytesRef (org.apache.lucene.util.BytesRef): 16
ByteArrayDataOutput (org.apache.lucene.store.ByteArrayDataOutput): 8
IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder): 5
IndexOutput (org.apache.lucene.store.IndexOutput): 4
BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder): 4
IOException (java.io.IOException): 3
HashSet (java.util.HashSet): 3
Random (java.util.Random): 3
IntPoint (org.apache.lucene.document.IntPoint): 3
CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder): 3
IntsRef (org.apache.lucene.util.IntsRef): 3
OfflineSorter (org.apache.lucene.util.OfflineSorter): 3
Pair (org.apache.lucene.util.fst.PairOutputs.Pair): 3
MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 2
TokenStreamToAutomaton (org.apache.lucene.analysis.TokenStreamToAutomaton): 2
Directory (org.apache.lucene.store.Directory): 2
IndexInput (org.apache.lucene.store.IndexInput): 2
CharsRef (org.apache.lucene.util.CharsRef): 2
LimitedFiniteStringsIterator (org.apache.lucene.util.automaton.LimitedFiniteStringsIterator): 2