Search in sources :

Example 1 with OfflineSorter

use of org.apache.lucene.util.OfflineSorter in project elasticsearch by elastic.

the class XAnalyzingSuggester method build.

/**
 * Builds the suggester FST from every entry supplied by {@code iterator}.
 *
 * <p>Each surface form is expanded into its analyzed forms. For every analyzed form one record is
 * written to a temporary file with the layout
 * {@code analyzedLength (short) | analyzed bytes | weight (int) | [surfaceLength (short)] | surface bytes | [payload bytes]}
 * (the bracketed parts are only present when payloads are enabled). The file is then sorted offline
 * and the sorted records are added to the FST builder, skipping repeated surface forms of the same
 * analyzed form and capping the number of surface forms per analyzed form at
 * {@code maxSurfaceFormsPerAnalyzedForm}.
 *
 * @param iterator source of surface forms, weights and (optionally) payloads
 * @throws IOException if writing, sorting or reading the temporary files fails
 * @throws IllegalArgumentException if an analyzed form (or, with payloads, a surface form) is longer
 *         than {@code Short.MAX_VALUE - 2} bytes, or if a surface form with payloads contains the
 *         reserved separator byte
 */
@Override
public void build(InputIterator iterator) throws IOException {
    // Refresh the payload flag BEFORE building the comparator: the comparator is configured from it,
    // so reading the field first would sort with whatever value a previous build left behind.
    hasPayloads = iterator.hasPayloads();
    String prefix = getClass().getSimpleName();
    Directory tempDir = getTempDir();
    OfflineSorter sorter = new OfflineSorter(tempDir, prefix, new AnalyzingComparator(hasPayloads));
    IndexOutput tempInput = tempDir.createTempOutput(prefix, "input", IOContext.DEFAULT);
    OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    OfflineSorter.ByteSequencesReader reader = null;
    BytesRefBuilder scratch = new BytesRefBuilder();
    TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
    String tempSortedFileName = null;
    count = 0;
    byte[] buffer = new byte[8];
    try {
        ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
        for (BytesRef surfaceForm; (surfaceForm = iterator.next()) != null; ) {
            LimitedFiniteStringsIterator finiteStrings = new LimitedFiniteStringsIterator(toAutomaton(surfaceForm, ts2a), maxGraphExpansions);
            for (IntsRef string; (string = finiteStrings.next()) != null; count++) {
                Util.toBytesRef(string, scratch);
                // length of the analyzed text (FST input); it is stored as a short below
                if (scratch.length() > Short.MAX_VALUE - 2) {
                    throw new IllegalArgumentException("cannot handle analyzed forms > " + (Short.MAX_VALUE - 2) + " in length (got " + scratch.length() + ")");
                }
                short analyzedLength = (short) scratch.length();
                // compute the required length:
                // analyzed sequence + weight (4) + surface + analyzedLength (short)
                int requiredLength = analyzedLength + 4 + surfaceForm.length + 2;
                BytesRef payload;
                if (hasPayloads) {
                    // the surface length is stored as a short when payloads are present
                    if (surfaceForm.length > (Short.MAX_VALUE - 2)) {
                        throw new IllegalArgumentException("cannot handle surface form > " + (Short.MAX_VALUE - 2) + " in length (got " + surfaceForm.length + ")");
                    }
                    payload = iterator.payload();
                    // payload + surfaceLength (short)
                    requiredLength += payload.length + 2;
                } else {
                    payload = null;
                }
                buffer = ArrayUtil.grow(buffer, requiredLength);
                // grow() may have returned a new array, so re-point the output at it
                output.reset(buffer);
                output.writeShort(analyzedLength);
                output.writeBytes(scratch.bytes(), 0, scratch.length());
                output.writeInt(encodeWeight(iterator.weight()));
                if (hasPayloads) {
                    // Scan exactly the surface form's slice [offset, offset + length) of the backing array.
                    for (int i = 0; i < surfaceForm.length; i++) {
                        if (surfaceForm.bytes[surfaceForm.offset + i] == payloadSep) {
                            throw new IllegalArgumentException("surface form cannot contain unit separator character U+001F; this character is reserved");
                        }
                    }
                    output.writeShort((short) surfaceForm.length);
                    output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
                    output.writeBytes(payload.bytes, payload.offset, payload.length);
                } else {
                    output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
                }
                assert output.getPosition() == requiredLength : output.getPosition() + " vs " + requiredLength;
                writer.write(buffer, 0, output.getPosition());
            }
            maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, finiteStrings.size());
        }
        // NOTE(review): no CodecUtil.writeFooter(tempInput) is written before closing; confirm the
        // OfflineSorter version in use does not require a checksum footer on its input file.
        writer.close();
        // Sort all input/output pairs (required by FST.Builder):
        tempSortedFileName = sorter.sort(tempInput.getName());
        // Free disk space:
        tempDir.deleteFile(tempInput.getName());
        reader = new OfflineSorter.ByteSequencesReader(tempDir.openChecksumInput(tempSortedFileName, IOContext.READONCE), prefix);
        PairOutputs<Long, BytesRef> outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
        Builder<Pair<Long, BytesRef>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
        // Build FST:
        BytesRefBuilder previousAnalyzed = null;
        BytesRefBuilder analyzed = new BytesRefBuilder();
        BytesRef surface = new BytesRef();
        IntsRefBuilder scratchInts = new IntsRefBuilder();
        ByteArrayDataInput input = new ByteArrayDataInput();
        // Used to remove duplicate surface forms (but we
        // still index the highest-weight one).  We clear
        // this when we see a new analyzed form, so it cannot
        // grow unbounded (at most 256 entries):
        Set<BytesRef> seenSurfaceForms = new HashSet<>();
        int dedup = 0;
        while (true) {
            BytesRef bytes = reader.next();
            if (bytes == null) {
                break;
            }
            input.reset(bytes.bytes, bytes.offset, bytes.length);
            short analyzedLength = input.readShort();
            // reserve two extra bytes for the (0, dedup) suffix appended further down
            analyzed.grow(analyzedLength + 2);
            input.readBytes(analyzed.bytes(), 0, analyzedLength);
            analyzed.setLength(analyzedLength);
            long cost = input.readInt();
            // surface is a view into the record's bytes; it is deep-copied wherever it must outlive this iteration
            surface.bytes = bytes.bytes;
            if (hasPayloads) {
                surface.length = input.readShort();
                surface.offset = input.getPosition();
            } else {
                surface.offset = input.getPosition();
                // assumes bytes.offset == 0 — TODO confirm against the reader's contract
                surface.length = bytes.length - surface.offset;
            }
            if (previousAnalyzed == null) {
                // very first record
                previousAnalyzed = new BytesRefBuilder();
                previousAnalyzed.copyBytes(analyzed);
                seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
            } else if (analyzed.get().equals(previousAnalyzed.get())) {
                // same analyzed form as the previous record
                dedup++;
                if (dedup >= maxSurfaceFormsPerAnalyzedForm) {
                    // dups: skip the rest:
                    continue;
                }
                if (seenSurfaceForms.contains(surface)) {
                    continue;
                }
                seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
            } else {
                // new analyzed form: restart the per-form bookkeeping
                dedup = 0;
                previousAnalyzed.copyBytes(analyzed);
                seenSurfaceForms.clear();
                seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
            }
            // TODO: I think we can avoid the extra 2 bytes when
            // there is no dup (dedup==0), but we'd have to fix
            // the exactFirst logic ... which would be sort of
            // hairy because we'd need to special case the two
            // (dup/not dup)...
            // NOTE: must be byte 0 so we sort before whatever
            // is next
            analyzed.append((byte) 0);
            analyzed.append((byte) dedup);
            Util.toIntsRef(analyzed.get(), scratchInts);
            if (!hasPayloads) {
                builder.add(scratchInts.get(), outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
            } else {
                // FST output is: surface bytes | separator byte | payload bytes
                int payloadOffset = input.getPosition() + surface.length;
                int payloadLength = bytes.length - payloadOffset;
                BytesRef br = new BytesRef(surface.length + 1 + payloadLength);
                System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length);
                br.bytes[surface.length] = (byte) payloadSep;
                System.arraycopy(bytes.bytes, payloadOffset, br.bytes, surface.length + 1, payloadLength);
                br.length = br.bytes.length;
                builder.add(scratchInts.get(), outputs.newPair(cost, br));
            }
        }
        fst = builder.finish();
    } finally {
        // Best-effort cleanup on both the success and the failure path.
        IOUtils.closeWhileHandlingException(reader, writer);
        IOUtils.deleteFilesIgnoringExceptions(tempDir, tempInput.getName(), tempSortedFileName);
    }
}
Also used : OfflineSorter(org.apache.lucene.util.OfflineSorter) Builder(org.apache.lucene.util.fst.Builder) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) PairOutputs(org.apache.lucene.util.fst.PairOutputs) IntsRef(org.apache.lucene.util.IntsRef) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) Pair(org.apache.lucene.util.fst.PairOutputs.Pair) HashSet(java.util.HashSet) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) IndexOutput(org.apache.lucene.store.IndexOutput) LimitedFiniteStringsIterator(org.apache.lucene.util.automaton.LimitedFiniteStringsIterator) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) ByteArrayDataInput(org.apache.lucene.store.ByteArrayDataInput) ByteArrayDataOutput(org.apache.lucene.store.ByteArrayDataOutput) TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton)

Example 2 with OfflineSorter

use of org.apache.lucene.util.OfflineSorter in project lucene-solr by apache.

the class Dictionary method readDictionaryFiles.

/**
   * Reads the dictionary files through the provided InputStreams, sorts the entries offline in
   * temporary files and adds them, grouped by word, to the {@code words} builder.
   *
   * @param tempDir directory in which the unsorted and sorted temporary files are created
   * @param tempFileNamePrefix prefix passed on when creating the temporary files and the sorter
   * @param dictionaries InputStreams to read the dictionary file through
   * @param decoder CharsetDecoder used to decode the contents of the file
   * @param words builder that receives one entry per distinct word together with its accumulated ordinals
   * @throws IOException Can be thrown while reading from the file
   * @throws IllegalArgumentException if the sorted entries are encountered out of order
   */
private void readDictionaryFiles(Directory tempDir, String tempFileNamePrefix, List<InputStream> dictionaries, CharsetDecoder decoder, Builder<IntsRef> words) throws IOException {
    BytesRefBuilder flagsScratch = new BytesRefBuilder();
    IntsRefBuilder scratchInts = new IntsRefBuilder();
    StringBuilder sb = new StringBuilder();
    // Phase 1: write every (unescaped, optionally cleaned) dictionary line to an unsorted temp file.
    IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
    try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) {
        for (InputStream dictionary : dictionaries) {
            BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
            // first line is number of entries (approximately, sometimes)
            // it is read here and then discarded by the loop condition below
            String line = lines.readLine();
            while ((line = lines.readLine()) != null) {
                // wild and unpredictable code comment rules
                if (line.isEmpty() || line.charAt(0) == '/' || line.charAt(0) == '#' || line.charAt(0) == '\t') {
                    continue;
                }
                line = unescapeEntry(line);
                // if we haven't seen any stem exceptions, try to parse one
                if (hasStemExceptions == false) {
                    int morphStart = line.indexOf(MORPH_SEPARATOR);
                    if (morphStart >= 0 && morphStart < line.length()) {
                        hasStemExceptions = parseStemException(line.substring(morphStart + 1)) != null;
                    }
                }
                if (needsInputCleaning) {
                    // only the word part (before the first flag/morph separator) is cleaned;
                    // whatever follows the separator is written unchanged
                    int flagSep = line.indexOf(FLAG_SEPARATOR);
                    if (flagSep == -1) {
                        flagSep = line.indexOf(MORPH_SEPARATOR);
                    }
                    if (flagSep == -1) {
                        CharSequence cleansed = cleanInput(line, sb);
                        writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8));
                    } else {
                        String text = line.substring(0, flagSep);
                        CharSequence cleansed = cleanInput(text, sb);
                        // make sure sb holds the cleaned word before the remainder is appended to it
                        if (cleansed != sb) {
                            sb.setLength(0);
                            sb.append(cleansed);
                        }
                        sb.append(line.substring(flagSep));
                        writer.write(sb.toString().getBytes(StandardCharsets.UTF_8));
                    }
                } else {
                    writer.write(line.getBytes(StandardCharsets.UTF_8));
                }
            }
        }
        CodecUtil.writeFooter(unsorted);
    }
    // Phase 2: sort the rows. Rows are ordered by the bytes preceding their last FLAG_SEPARATOR or
    // MORPH_SEPARATOR byte; rows that tie on that prefix are ordered by the whole row.
    OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix, new Comparator<BytesRef>() {

        // reusable views onto the rows being compared, truncated at the separator
        BytesRef scratch1 = new BytesRef();

        BytesRef scratch2 = new BytesRef();

        @Override
        public int compare(BytesRef o1, BytesRef o2) {
            scratch1.bytes = o1.bytes;
            scratch1.offset = o1.offset;
            scratch1.length = o1.length;
            // scan backwards and cut the view at the first separator found from the end
            for (int i = scratch1.length - 1; i >= 0; i--) {
                if (scratch1.bytes[scratch1.offset + i] == FLAG_SEPARATOR || scratch1.bytes[scratch1.offset + i] == MORPH_SEPARATOR) {
                    scratch1.length = i;
                    break;
                }
            }
            scratch2.bytes = o2.bytes;
            scratch2.offset = o2.offset;
            scratch2.length = o2.length;
            for (int i = scratch2.length - 1; i >= 0; i--) {
                if (scratch2.bytes[scratch2.offset + i] == FLAG_SEPARATOR || scratch2.bytes[scratch2.offset + i] == MORPH_SEPARATOR) {
                    scratch2.length = i;
                    break;
                }
            }
            int cmp = scratch1.compareTo(scratch2);
            if (cmp == 0) {
                // tie break on whole row
                return o1.compareTo(o2);
            } else {
                return cmp;
            }
        }
    });
    String sorted;
    boolean success = false;
    try {
        sorted = sorter.sort(unsorted.getName());
        success = true;
    } finally {
        // The unsorted file is removed either way: strictly (errors propagate) after a successful
        // sort, best-effort when the sort itself failed so that its exception is not masked.
        if (success) {
            tempDir.deleteFile(unsorted.getName());
        } else {
            IOUtils.deleteFilesIgnoringExceptions(tempDir, unsorted.getName());
        }
    }
    // Phase 3: stream the sorted rows, collect the ordinals of consecutive rows that share the same
    // word, and add each finished word to the builder.
    boolean success2 = false;
    try (ByteSequencesReader reader = new ByteSequencesReader(tempDir.openChecksumInput(sorted, IOContext.READONCE), sorted)) {
        // TODO: the flags themselves can be double-chars (long) or also numeric
        // either way the trick is to encode them as char... but they must be parsed differently
        String currentEntry = null;
        IntsRefBuilder currentOrds = new IntsRefBuilder();
        while (true) {
            BytesRef scratch = reader.next();
            if (scratch == null) {
                break;
            }
            String line = scratch.utf8ToString();
            String entry;
            char[] wordForm;
            int end;
            int flagSep = line.indexOf(FLAG_SEPARATOR);
            if (flagSep == -1) {
                // no flags: the word runs up to the morph separator
                // NOTE(review): presumably unescapeEntry guarantees a MORPH_SEPARATOR in every row;
                // otherwise end would be -1 and substring would throw — confirm
                wordForm = NOFLAGS;
                end = line.indexOf(MORPH_SEPARATOR);
                entry = line.substring(0, end);
            } else {
                // flags sit between the flag separator and the morph separator
                end = line.indexOf(MORPH_SEPARATOR);
                String flagPart = line.substring(flagSep + 1, end);
                if (aliasCount > 0) {
                    // with aliases defined, the flag part is a number that is resolved via getAliasValue
                    flagPart = getAliasValue(Integer.parseInt(flagPart));
                }
                wordForm = flagParsingStrategy.parseFlags(flagPart);
                Arrays.sort(wordForm);
                entry = line.substring(0, flagSep);
            }
            // we possibly have morphological data
            int stemExceptionID = 0;
            if (hasStemExceptions && end + 1 < line.length()) {
                String stemException = parseStemException(line.substring(end + 1));
                if (stemException != null) {
                    // grow the stem exception table when it is full
                    if (stemExceptionCount == stemExceptions.length) {
                        int newSize = ArrayUtil.oversize(stemExceptionCount + 1, RamUsageEstimator.NUM_BYTES_OBJECT_REF);
                        stemExceptions = Arrays.copyOf(stemExceptions, newSize);
                    }
                    // we use '0' to indicate no exception for the form
                    // hence the stored ID is the array index plus one
                    stemExceptionID = stemExceptionCount + 1;
                    stemExceptions[stemExceptionCount++] = stemException;
                }
            }
            // cmp > 0: a new word starts; cmp == 0: another row for the current word;
            // cmp < 0: the sorted input is not in the expected order
            int cmp = currentEntry == null ? 1 : entry.compareTo(currentEntry);
            if (cmp < 0) {
                throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
            } else {
                encodeFlags(flagsScratch, wordForm);
                int ord = flagLookup.add(flagsScratch.get());
                if (ord < 0) {
                    // already exists in our hash
                    ord = (-ord) - 1;
                }
                // finalize current entry, and switch "current" if necessary
                if (cmp > 0 && currentEntry != null) {
                    Util.toUTF32(currentEntry, scratchInts);
                    words.add(scratchInts.get(), currentOrds.get());
                }
                // swap current
                if (cmp > 0 || currentEntry == null) {
                    currentEntry = entry;
                    // must be this way
                    currentOrds = new IntsRefBuilder();
                }
                // with stem exceptions every form contributes two ints (ord, stemExceptionID), otherwise one
                if (hasStemExceptions) {
                    currentOrds.append(ord);
                    currentOrds.append(stemExceptionID);
                } else {
                    currentOrds.append(ord);
                }
            }
        }
        // finalize last entry
        // NOTE(review): currentEntry is still null here if the sorted file held no rows — confirm
        // that an empty dictionary cannot reach this point
        Util.toUTF32(currentEntry, scratchInts);
        words.add(scratchInts.get(), currentOrds.get());
        success2 = true;
    } finally {
        // same cleanup policy as for the unsorted file: strict on success, best-effort on failure
        if (success2) {
            tempDir.deleteFile(sorted);
        } else {
            IOUtils.deleteFilesIgnoringExceptions(tempDir, sorted);
        }
    }
}
Also used : OfflineSorter(org.apache.lucene.util.OfflineSorter) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) InputStreamReader(java.io.InputStreamReader) BufferedInputStream(java.io.BufferedInputStream) InputStream(java.io.InputStream) IndexOutput(org.apache.lucene.store.IndexOutput) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) ByteSequencesReader(org.apache.lucene.util.OfflineSorter.ByteSequencesReader) BufferedReader(java.io.BufferedReader) ByteSequencesWriter(org.apache.lucene.util.OfflineSorter.ByteSequencesWriter) BytesRef(org.apache.lucene.util.BytesRef)

Example 3 with OfflineSorter

use of org.apache.lucene.util.OfflineSorter in project lucene-solr by apache.

the class SortedInputIterator method sort.

/**
 * Encodes every entry of {@code source} into a temporary file, sorts that file offline with
 * {@code tieBreakByCostComparator} and opens a reader on the sorted result.
 *
 * <p>The created temp output and the sorted file name are remembered in {@code tempInput} and
 * {@code tempSortedFileName}.
 *
 * @return a reader positioned at the start of the sorted entries
 * @throws IOException if writing, sorting or opening the temporary files fails
 */
private ByteSequencesReader sort() throws IOException {
    final OfflineSorter offlineSorter = new OfflineSorter(tempDir, tempFileNamePrefix, tieBreakByCostComparator);
    tempInput = tempDir.createTempOutput(tempFileNamePrefix, "input", IOContext.DEFAULT);
    try (OfflineSorter.ByteSequencesWriter entryWriter = new OfflineSorter.ByteSequencesWriter(tempInput)) {
        final byte[] scratchBytes = new byte[0];
        final ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput(scratchBytes);
        // Drain the source; payload, contexts and weight are read for the entry just returned by next().
        for (BytesRef term = source.next(); term != null; term = source.next()) {
            encode(entryWriter, scratchOutput, scratchBytes, term, source.payload(), source.contexts(), source.weight());
        }
        // The footer must be in place before the writer is closed by try-with-resources.
        CodecUtil.writeFooter(tempInput);
    }
    final String unsortedName = tempInput.getName();
    tempSortedFileName = offlineSorter.sort(unsortedName);
    return new OfflineSorter.ByteSequencesReader(
        tempDir.openChecksumInput(tempSortedFileName, IOContext.READONCE),
        tempSortedFileName);
}
Also used : OfflineSorter(org.apache.lucene.util.OfflineSorter) ByteSequencesWriter(org.apache.lucene.util.OfflineSorter.ByteSequencesWriter) ByteSequencesReader(org.apache.lucene.util.OfflineSorter.ByteSequencesReader) ByteArrayDataOutput(org.apache.lucene.store.ByteArrayDataOutput) ByteSequencesWriter(org.apache.lucene.util.OfflineSorter.ByteSequencesWriter) BytesRef(org.apache.lucene.util.BytesRef)

Example 4 with OfflineSorter

use of org.apache.lucene.util.OfflineSorter in project lucene-solr by apache.

the class AnalyzingSuggester method build.

/**
 * Builds the suggester FST from every entry supplied by {@code iterator}.
 *
 * <p>Each surface form is expanded into its analyzed forms. For every analyzed form one record is
 * written to a temporary file with the layout
 * {@code analyzedLength (short) | analyzed bytes | weight (int) | [surfaceLength (short)] | surface bytes | [payload bytes]}
 * (the bracketed parts are only present when payloads are enabled). The file is then sorted offline
 * and the sorted records are added to the FST builder, skipping repeated surface forms of the same
 * analyzed form and capping the number of surface forms per analyzed form at
 * {@code maxSurfaceFormsPerAnalyzedForm}.
 *
 * @param iterator source of surface forms, weights and (optionally) payloads; must not have contexts
 * @throws IOException if writing, sorting or reading the temporary files fails
 * @throws IllegalArgumentException if the iterator has contexts, if an analyzed form (or, with
 *         payloads, a surface form) is longer than {@code Short.MAX_VALUE - 2} bytes, or if a
 *         surface form with payloads contains the reserved separator byte
 */
@Override
public void build(InputIterator iterator) throws IOException {
    if (iterator.hasContexts()) {
        throw new IllegalArgumentException("this suggester doesn't support contexts");
    }
    // Must be set before the comparator is created, which is configured from it.
    hasPayloads = iterator.hasPayloads();
    OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix, new AnalyzingComparator(hasPayloads));
    IndexOutput tempInput = tempDir.createTempOutput(tempFileNamePrefix, "input", IOContext.DEFAULT);
    OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    OfflineSorter.ByteSequencesReader reader = null;
    BytesRefBuilder scratch = new BytesRefBuilder();
    TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
    String tempSortedFileName = null;
    count = 0;
    byte[] buffer = new byte[8];
    try {
        ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
        for (BytesRef surfaceForm; (surfaceForm = iterator.next()) != null; ) {
            LimitedFiniteStringsIterator finiteStrings = new LimitedFiniteStringsIterator(toAutomaton(surfaceForm, ts2a), maxGraphExpansions);
            for (IntsRef string; (string = finiteStrings.next()) != null; count++) {
                Util.toBytesRef(string, scratch);
                // length of the analyzed text (FST input); it is stored as a short below
                if (scratch.length() > Short.MAX_VALUE - 2) {
                    throw new IllegalArgumentException("cannot handle analyzed forms > " + (Short.MAX_VALUE - 2) + " in length (got " + scratch.length() + ")");
                }
                short analyzedLength = (short) scratch.length();
                // compute the required length:
                // analyzed sequence + weight (4) + surface + analyzedLength (short)
                int requiredLength = analyzedLength + 4 + surfaceForm.length + 2;
                BytesRef payload;
                if (hasPayloads) {
                    // the surface length is stored as a short when payloads are present
                    if (surfaceForm.length > (Short.MAX_VALUE - 2)) {
                        throw new IllegalArgumentException("cannot handle surface form > " + (Short.MAX_VALUE - 2) + " in length (got " + surfaceForm.length + ")");
                    }
                    payload = iterator.payload();
                    // payload + surfaceLength (short)
                    requiredLength += payload.length + 2;
                } else {
                    payload = null;
                }
                buffer = ArrayUtil.grow(buffer, requiredLength);
                // grow() may have returned a new array, so re-point the output at it
                output.reset(buffer);
                output.writeShort(analyzedLength);
                output.writeBytes(scratch.bytes(), 0, scratch.length());
                output.writeInt(encodeWeight(iterator.weight()));
                if (hasPayloads) {
                    // Scan exactly the surface form's slice [offset, offset + length) of the backing array.
                    for (int i = 0; i < surfaceForm.length; i++) {
                        if (surfaceForm.bytes[surfaceForm.offset + i] == PAYLOAD_SEP) {
                            throw new IllegalArgumentException("surface form cannot contain unit separator character U+001F; this character is reserved");
                        }
                    }
                    output.writeShort((short) surfaceForm.length);
                    output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
                    output.writeBytes(payload.bytes, payload.offset, payload.length);
                } else {
                    output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
                }
                assert output.getPosition() == requiredLength : output.getPosition() + " vs " + requiredLength;
                writer.write(buffer, 0, output.getPosition());
            }
            maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, finiteStrings.size());
        }
        // The footer has to be written before the writer is closed.
        CodecUtil.writeFooter(tempInput);
        writer.close();
        // Sort all input/output pairs (required by FST.Builder):
        tempSortedFileName = sorter.sort(tempInput.getName());
        // Free disk space:
        tempDir.deleteFile(tempInput.getName());
        reader = new OfflineSorter.ByteSequencesReader(tempDir.openChecksumInput(tempSortedFileName, IOContext.READONCE), tempSortedFileName);
        PairOutputs<Long, BytesRef> outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
        Builder<Pair<Long, BytesRef>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
        // Build FST:
        BytesRefBuilder previousAnalyzed = null;
        BytesRefBuilder analyzed = new BytesRefBuilder();
        BytesRef surface = new BytesRef();
        IntsRefBuilder scratchInts = new IntsRefBuilder();
        ByteArrayDataInput input = new ByteArrayDataInput();
        // Used to remove duplicate surface forms (but we
        // still index the highest-weight one).  We clear
        // this when we see a new analyzed form, so it cannot
        // grow unbounded (at most 256 entries):
        Set<BytesRef> seenSurfaceForms = new HashSet<>();
        int dedup = 0;
        while (true) {
            BytesRef bytes = reader.next();
            if (bytes == null) {
                break;
            }
            input.reset(bytes.bytes, bytes.offset, bytes.length);
            short analyzedLength = input.readShort();
            // reserve two extra bytes for the (0, dedup) suffix appended further down
            analyzed.grow(analyzedLength + 2);
            input.readBytes(analyzed.bytes(), 0, analyzedLength);
            analyzed.setLength(analyzedLength);
            long cost = input.readInt();
            // surface is a view into the record's bytes; it is deep-copied wherever it must outlive this iteration
            surface.bytes = bytes.bytes;
            if (hasPayloads) {
                surface.length = input.readShort();
                surface.offset = input.getPosition();
            } else {
                surface.offset = input.getPosition();
                // assumes bytes.offset == 0 — TODO confirm against the reader's contract
                surface.length = bytes.length - surface.offset;
            }
            if (previousAnalyzed == null) {
                // very first record
                previousAnalyzed = new BytesRefBuilder();
                previousAnalyzed.copyBytes(analyzed.get());
                seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
            } else if (analyzed.get().equals(previousAnalyzed.get())) {
                // same analyzed form as the previous record
                dedup++;
                if (dedup >= maxSurfaceFormsPerAnalyzedForm) {
                    // dups: skip the rest:
                    continue;
                }
                if (seenSurfaceForms.contains(surface)) {
                    continue;
                }
                seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
            } else {
                // new analyzed form: restart the per-form bookkeeping
                dedup = 0;
                previousAnalyzed.copyBytes(analyzed);
                seenSurfaceForms.clear();
                seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
            }
            // TODO: I think we can avoid the extra 2 bytes when
            // there is no dup (dedup==0), but we'd have to fix
            // the exactFirst logic ... which would be sort of
            // hairy because we'd need to special case the two
            // (dup/not dup)...
            // NOTE: must be byte 0 so we sort before whatever
            // is next
            analyzed.append((byte) 0);
            analyzed.append((byte) dedup);
            Util.toIntsRef(analyzed.get(), scratchInts);
            if (!hasPayloads) {
                builder.add(scratchInts.get(), outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
            } else {
                // FST output is: surface bytes | separator byte | payload bytes
                int payloadOffset = input.getPosition() + surface.length;
                int payloadLength = bytes.length - payloadOffset;
                BytesRef br = new BytesRef(surface.length + 1 + payloadLength);
                System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length);
                br.bytes[surface.length] = PAYLOAD_SEP;
                System.arraycopy(bytes.bytes, payloadOffset, br.bytes, surface.length + 1, payloadLength);
                br.length = br.bytes.length;
                builder.add(scratchInts.get(), outputs.newPair(cost, br));
            }
        }
        fst = builder.finish();
    } finally {
        // Best-effort cleanup on both the success and the failure path.
        IOUtils.closeWhileHandlingException(reader, writer);
        IOUtils.deleteFilesIgnoringExceptions(tempDir, tempInput.getName(), tempSortedFileName);
    }
}
Also used : OfflineSorter(org.apache.lucene.util.OfflineSorter) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) Builder(org.apache.lucene.util.fst.Builder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) PairOutputs(org.apache.lucene.util.fst.PairOutputs) IntsRef(org.apache.lucene.util.IntsRef) BytesRef(org.apache.lucene.util.BytesRef) Pair(org.apache.lucene.util.fst.PairOutputs.Pair) HashSet(java.util.HashSet) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) IndexOutput(org.apache.lucene.store.IndexOutput) LimitedFiniteStringsIterator(org.apache.lucene.util.automaton.LimitedFiniteStringsIterator) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) ByteArrayDataInput(org.apache.lucene.store.ByteArrayDataInput) ByteArrayDataOutput(org.apache.lucene.store.ByteArrayDataOutput) TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton)

Example 5 with OfflineSorter

use of org.apache.lucene.util.OfflineSorter in project lucene-solr by apache.

the class FSTCompletionLookup method build.

/**
 * Builds the two completion automata from the entries supplied by {@code iterator}.
 *
 * <p>Every entry is written to a temporary file as its encoded weight followed by the term bytes,
 * the file is sorted offline, and the sorted entries are spread over {@code buckets} buckets by
 * their position in the sorted order (entries with the same score as their predecessor share its
 * bucket) before being handed to the completion builder.
 *
 * @param iterator source of terms and weights; must have neither payloads nor contexts
 * @throws IOException if writing, sorting or reading the temporary files fails
 * @throws IllegalArgumentException if the iterator has payloads or contexts
 */
@Override
public void build(InputIterator iterator) throws IOException {
    if (iterator.hasPayloads()) {
        throw new IllegalArgumentException("this suggester doesn't support payloads");
    }
    if (iterator.hasContexts()) {
        throw new IllegalArgumentException("this suggester doesn't support contexts");
    }
    OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix);
    ExternalRefSorter externalSorter = new ExternalRefSorter(sorter);
    IndexOutput tempInput = tempDir.createTempOutput(tempFileNamePrefix, "input", IOContext.DEFAULT);
    String tempSortedFileName = null;
    OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    OfflineSorter.ByteSequencesReader reader = null;
    // The encoded weight is placed in front of each term so that sorting orders by weight first.
    // Weights are assumed to be non-negative; negative values would need extra care to keep
    // their byte order meaningful.
    count = 0;
    try {
        byte[] encoded = new byte[0];
        ByteArrayDataOutput encoder = new ByteArrayDataOutput(encoded);
        int totalEntries = 0;
        for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
            final int needed = term.length + 4;
            if (needed >= encoded.length) {
                encoded = ArrayUtil.grow(encoded, needed);
            }
            encoder.reset(encoded);
            encoder.writeInt(encodeWeight(iterator.weight()));
            encoder.writeBytes(term.bytes, term.offset, term.length);
            writer.write(encoded, 0, encoder.getPosition());
            totalEntries++;
        }
        CodecUtil.writeFooter(tempInput);
        writer.close();
        // The score distribution is unknown, so sort everything and cut the sorted sequence
        // into equally sized buckets.
        tempSortedFileName = sorter.sort(tempInput.getName());
        tempDir.deleteFile(tempInput.getName());
        FSTCompletionBuilder builder = new FSTCompletionBuilder(buckets, externalSorter, sharedTailLength);
        reader = new OfflineSorter.ByteSequencesReader(tempDir.openChecksumInput(tempSortedFileName, IOContext.READONCE), tempSortedFileName);
        long ordinal = 0;
        int lastBucket = 0;
        int lastScore = 0;
        ByteArrayDataInput decoder = new ByteArrayDataInput();
        BytesRef termOnly = new BytesRef();
        for (BytesRef entry = reader.next(); entry != null; entry = reader.next()) {
            decoder.reset(entry.bytes, entry.offset, entry.length);
            final int score = decoder.readInt();
            // Equal scores stay in the same bucket; otherwise the bucket follows the sorted position.
            final int bucket;
            if (ordinal == 0 || score != lastScore) {
                bucket = (int) (ordinal * buckets / totalEntries);
            } else {
                bucket = lastBucket;
            }
            lastScore = score;
            lastBucket = bucket;
            // Hand over only the term bytes; the leading weight is dropped.
            final int consumed = decoder.getPosition();
            termOnly.bytes = entry.bytes;
            termOnly.offset = entry.offset + consumed;
            termOnly.length = entry.length - consumed;
            builder.add(termOnly, bucket);
            ordinal++;
            count++;
        }
        // Both FSTCompletion instances are backed by the same automaton.
        this.higherWeightsCompletion = builder.build();
        this.normalCompletion = new FSTCompletion(higherWeightsCompletion.getFST(), false, exactMatchFirst);
    } finally {
        IOUtils.closeWhileHandlingException(reader, writer, externalSorter);
        IOUtils.deleteFilesIgnoringExceptions(tempDir, tempInput.getName(), tempSortedFileName);
    }
}
Also used : OfflineSorter(org.apache.lucene.util.OfflineSorter) IndexOutput(org.apache.lucene.store.IndexOutput) ByteArrayDataInput(org.apache.lucene.store.ByteArrayDataInput) ByteArrayDataOutput(org.apache.lucene.store.ByteArrayDataOutput) BytesRef(org.apache.lucene.util.BytesRef)

Aggregations

OfflineSorter (org.apache.lucene.util.OfflineSorter)8 BytesRef (org.apache.lucene.util.BytesRef)7 IndexOutput (org.apache.lucene.store.IndexOutput)6 ByteArrayDataOutput (org.apache.lucene.store.ByteArrayDataOutput)4 ByteArrayDataInput (org.apache.lucene.store.ByteArrayDataInput)3 BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder)3 IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder)3 HashSet (java.util.HashSet)2 TokenStreamToAutomaton (org.apache.lucene.analysis.TokenStreamToAutomaton)2 ChecksumIndexInput (org.apache.lucene.store.ChecksumIndexInput)2 Directory (org.apache.lucene.store.Directory)2 BytesRefComparator (org.apache.lucene.util.BytesRefComparator)2 CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder)2 IntsRef (org.apache.lucene.util.IntsRef)2 ByteSequencesReader (org.apache.lucene.util.OfflineSorter.ByteSequencesReader)2 ByteSequencesWriter (org.apache.lucene.util.OfflineSorter.ByteSequencesWriter)2 LimitedFiniteStringsIterator (org.apache.lucene.util.automaton.LimitedFiniteStringsIterator)2 Builder (org.apache.lucene.util.fst.Builder)2 PairOutputs (org.apache.lucene.util.fst.PairOutputs)2 Pair (org.apache.lucene.util.fst.PairOutputs.Pair)2