Search in sources :

Example 1 with ByteSequencesWriter

use of org.apache.lucene.util.OfflineSorter.ByteSequencesWriter in project lucene-solr by apache.

the class Dictionary method readDictionaryFiles.

/**
 * Reads the dictionary file(s) through the provided InputStreams, building up the words map.
 *
 * <p>The work happens in three phases: every usable dictionary line is written to a temporary
 * file in {@code tempDir}, that file is sorted offline by entry text, and the sorted rows are
 * then read back so that all rows sharing the same entry are added to {@code words} together.
 * Both temporary files are deleted before this method returns, whether it succeeds or fails.
 *
 * @param tempDir Directory in which the temporary unsorted and sorted files are created
 * @param tempFileNamePrefix prefix used for the names of the temporary files
 * @param dictionaries InputStreams to read the dictionary file through
 * @param decoder CharsetDecoder used to decode the contents of the file
 * @param words builder receiving one input per distinct entry; the output holds the flag
 *        ordinal of every row of that entry, each followed by a stem exception id when stem
 *        exceptions are in use
 * @throws IOException Can be thrown while reading from the file
 * @throws IllegalArgumentException if the sorted rows are found to be out of order
 */
private void readDictionaryFiles(Directory tempDir, String tempFileNamePrefix, List<InputStream> dictionaries, CharsetDecoder decoder, Builder<IntsRef> words) throws IOException {
    BytesRefBuilder flagsScratch = new BytesRefBuilder();
    IntsRefBuilder scratchInts = new IntsRefBuilder();
    StringBuilder sb = new StringBuilder();
    IndexOutput unsorted = tempDir.createTempOutput(tempFileNamePrefix, "dat", IOContext.DEFAULT);
    boolean written = false;
    try {
        try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) {
            for (InputStream dictionary : dictionaries) {
                BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
                // first line is number of entries (approximately, sometimes); it is read and discarded
                String line = lines.readLine();
                while ((line = lines.readLine()) != null) {
                    // wild and unpredictable code comment rules
                    if (line.isEmpty() || line.charAt(0) == '/' || line.charAt(0) == '#' || line.charAt(0) == '\t') {
                        continue;
                    }
                    line = unescapeEntry(line);
                    // if we haven't seen any stem exceptions, try to parse one
                    if (hasStemExceptions == false) {
                        int morphStart = line.indexOf(MORPH_SEPARATOR);
                        // indexOf never returns a value >= length, so a non-negative check suffices
                        if (morphStart >= 0) {
                            hasStemExceptions = parseStemException(line.substring(morphStart + 1)) != null;
                        }
                    }
                    if (needsInputCleaning) {
                        // only the word itself is cleaned; flags and morphological data are kept verbatim
                        int flagSep = line.indexOf(FLAG_SEPARATOR);
                        if (flagSep == -1) {
                            flagSep = line.indexOf(MORPH_SEPARATOR);
                        }
                        if (flagSep == -1) {
                            CharSequence cleansed = cleanInput(line, sb);
                            writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8));
                        } else {
                            String text = line.substring(0, flagSep);
                            CharSequence cleansed = cleanInput(text, sb);
                            if (cleansed != sb) {
                                sb.setLength(0);
                                sb.append(cleansed);
                            }
                            sb.append(line.substring(flagSep));
                            writer.write(sb.toString().getBytes(StandardCharsets.UTF_8));
                        }
                    } else {
                        writer.write(line.getBytes(StandardCharsets.UTF_8));
                    }
                }
            }
            CodecUtil.writeFooter(unsorted);
        }
        written = true;
    } finally {
        if (written == false) {
            // a failure while reading/writing must not leave the partial temp file behind;
            // presumably the writer has already closed the underlying output at this point
            IOUtils.deleteFilesIgnoringExceptions(tempDir, unsorted.getName());
        }
    }
    OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix, new Comparator<BytesRef>() {

        BytesRef scratch1 = new BytesRef();

        BytesRef scratch2 = new BytesRef();

        @Override
        public int compare(BytesRef o1, BytesRef o2) {
            // compare only the part of each row before its last flag/morph separator
            scratch1.bytes = o1.bytes;
            scratch1.offset = o1.offset;
            scratch1.length = o1.length;
            for (int i = scratch1.length - 1; i >= 0; i--) {
                if (scratch1.bytes[scratch1.offset + i] == FLAG_SEPARATOR || scratch1.bytes[scratch1.offset + i] == MORPH_SEPARATOR) {
                    scratch1.length = i;
                    break;
                }
            }
            scratch2.bytes = o2.bytes;
            scratch2.offset = o2.offset;
            scratch2.length = o2.length;
            for (int i = scratch2.length - 1; i >= 0; i--) {
                if (scratch2.bytes[scratch2.offset + i] == FLAG_SEPARATOR || scratch2.bytes[scratch2.offset + i] == MORPH_SEPARATOR) {
                    scratch2.length = i;
                    break;
                }
            }
            int cmp = scratch1.compareTo(scratch2);
            if (cmp == 0) {
                // tie break on whole row
                return o1.compareTo(o2);
            } else {
                return cmp;
            }
        }
    });
    String sorted;
    boolean success = false;
    try {
        sorted = sorter.sort(unsorted.getName());
        success = true;
    } finally {
        // the unsorted input is no longer needed, whether or not sorting worked
        if (success) {
            tempDir.deleteFile(unsorted.getName());
        } else {
            IOUtils.deleteFilesIgnoringExceptions(tempDir, unsorted.getName());
        }
    }
    boolean success2 = false;
    try (ByteSequencesReader reader = new ByteSequencesReader(tempDir.openChecksumInput(sorted, IOContext.READONCE), sorted)) {
        // TODO: the flags themselves can be double-chars (long) or also numeric
        // either way the trick is to encode them as char... but they must be parsed differently
        String currentEntry = null;
        IntsRefBuilder currentOrds = new IntsRefBuilder();
        while (true) {
            BytesRef scratch = reader.next();
            if (scratch == null) {
                break;
            }
            String line = scratch.utf8ToString();
            String entry;
            char[] wordForm;
            int end;
            int flagSep = line.indexOf(FLAG_SEPARATOR);
            // NOTE(review): both branches assume every row contains MORPH_SEPARATOR (end != -1);
            // presumably unescapeEntry guarantees that - confirm
            if (flagSep == -1) {
                wordForm = NOFLAGS;
                end = line.indexOf(MORPH_SEPARATOR);
                entry = line.substring(0, end);
            } else {
                end = line.indexOf(MORPH_SEPARATOR);
                String flagPart = line.substring(flagSep + 1, end);
                if (aliasCount > 0) {
                    flagPart = getAliasValue(Integer.parseInt(flagPart));
                }
                wordForm = flagParsingStrategy.parseFlags(flagPart);
                Arrays.sort(wordForm);
                entry = line.substring(0, flagSep);
            }
            // we possibly have morphological data
            int stemExceptionID = 0;
            if (hasStemExceptions && end + 1 < line.length()) {
                String stemException = parseStemException(line.substring(end + 1));
                if (stemException != null) {
                    if (stemExceptionCount == stemExceptions.length) {
                        int newSize = ArrayUtil.oversize(stemExceptionCount + 1, RamUsageEstimator.NUM_BYTES_OBJECT_REF);
                        stemExceptions = Arrays.copyOf(stemExceptions, newSize);
                    }
                    // we use '0' to indicate no exception for the form
                    stemExceptionID = stemExceptionCount + 1;
                    stemExceptions[stemExceptionCount++] = stemException;
                }
            }
            int cmp = currentEntry == null ? 1 : entry.compareTo(currentEntry);
            if (cmp < 0) {
                throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
            } else {
                encodeFlags(flagsScratch, wordForm);
                int ord = flagLookup.add(flagsScratch.get());
                if (ord < 0) {
                    // already exists in our hash
                    ord = (-ord) - 1;
                }
                // finalize current entry, and switch "current" if necessary
                if (cmp > 0 && currentEntry != null) {
                    Util.toUTF32(currentEntry, scratchInts);
                    words.add(scratchInts.get(), currentOrds.get());
                }
                // swap current
                if (cmp > 0 || currentEntry == null) {
                    currentEntry = entry;
                    // must be this way
                    currentOrds = new IntsRefBuilder();
                }
                if (hasStemExceptions) {
                    currentOrds.append(ord);
                    currentOrds.append(stemExceptionID);
                } else {
                    currentOrds.append(ord);
                }
            }
        }
        // finalize last entry
        // NOTE(review): currentEntry is still null here when no rows were read (empty
        // dictionary); confirm how Util.toUTF32 and the callers are expected to handle that
        Util.toUTF32(currentEntry, scratchInts);
        words.add(scratchInts.get(), currentOrds.get());
        success2 = true;
    } finally {
        if (success2) {
            tempDir.deleteFile(sorted);
        } else {
            IOUtils.deleteFilesIgnoringExceptions(tempDir, sorted);
        }
    }
}
Also used : OfflineSorter(org.apache.lucene.util.OfflineSorter) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) InputStreamReader(java.io.InputStreamReader) BufferedInputStream(java.io.BufferedInputStream) InputStream(java.io.InputStream) IndexOutput(org.apache.lucene.store.IndexOutput) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) ByteSequencesReader(org.apache.lucene.util.OfflineSorter.ByteSequencesReader) BufferedReader(java.io.BufferedReader) ByteSequencesWriter(org.apache.lucene.util.OfflineSorter.ByteSequencesWriter) BytesRef(org.apache.lucene.util.BytesRef)

Example 2 with ByteSequencesWriter

use of org.apache.lucene.util.OfflineSorter.ByteSequencesWriter in project lucene-solr by apache.

the class TestOfflineSorter method testOverNexting.

// Verifies that OfflineSorter never calls ByteSequencesReader.next() again once it has returned null.
public void testOverNexting() throws Exception {
    Directory dir = newDirectory();
    IndexOutput unsortedOut = dir.createTempOutput("unsorted", "tmp", IOContext.DEFAULT);
    // write a single random 4-byte value followed by the codec footer
    try (ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(unsortedOut)) {
        byte[] value = new byte[Integer.BYTES];
        random().nextBytes(value);
        writer.write(value);
        CodecUtil.writeFooter(unsortedOut);
    }
    OfflineSorter sorter = new OfflineSorter(dir, "foo", OfflineSorter.DEFAULT_COMPARATOR, BufferSize.megabytes(4), OfflineSorter.MAX_TEMPFILES, Integer.BYTES, null, 0) {

        @Override
        protected ByteSequencesReader getReader(ChecksumIndexInput in, String name) throws IOException {
            // wrap the real reader so every call to next() can be checked
            final ByteSequencesReader delegate = super.getReader(in, name);
            return new ByteSequencesReader(in, name) {

                // becomes true once the delegate has reported the end of the sequence
                private boolean exhausted;

                @Override
                public BytesRef next() throws IOException {
                    // a call after the end was already reported is the bug being tested for
                    assertFalse(exhausted);
                    BytesRef next = delegate.next();
                    exhausted = (next == null);
                    return next;
                }

                @Override
                public void close() throws IOException {
                    delegate.close();
                }
            };
        }
    };
    sorter.sort(unsortedOut.getName());
    dir.close();
}
Also used : ChecksumIndexInput(org.apache.lucene.store.ChecksumIndexInput) CorruptingIndexOutput(org.apache.lucene.store.CorruptingIndexOutput) IndexOutput(org.apache.lucene.store.IndexOutput) IOException(java.io.IOException) ByteSequencesWriter(org.apache.lucene.util.OfflineSorter.ByteSequencesWriter) FilterDirectory(org.apache.lucene.store.FilterDirectory) Directory(org.apache.lucene.store.Directory)

Example 3 with ByteSequencesWriter

use of org.apache.lucene.util.OfflineSorter.ByteSequencesWriter in project lucene-solr by apache.

the class TestOfflineSorter method testFixedLengthLiesLiesLies.

public void testFixedLengthLiesLiesLies() throws Exception {
    // The sorter is told every value is Long.BYTES wide while the file really holds an
    // Integer.BYTES wide value; it must detect the mismatch and refuse to sort.
    Directory dir = newDirectory();
    IndexOutput unsortedOut = dir.createTempOutput("unsorted", "tmp", IOContext.DEFAULT);
    try (ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(unsortedOut)) {
        byte[] value = new byte[Integer.BYTES];
        random().nextBytes(value);
        writer.write(value);
        CodecUtil.writeFooter(unsortedOut);
    }
    OfflineSorter lyingSorter = new OfflineSorter(dir, "foo", OfflineSorter.DEFAULT_COMPARATOR, BufferSize.megabytes(4), OfflineSorter.MAX_TEMPFILES, Long.BYTES, null, 0);
    IllegalArgumentException failure = expectThrows(IllegalArgumentException.class, () -> lyingSorter.sort(unsortedOut.getName()));
    String expectedMessage = "value length is 4 but is supposed to always be 8";
    assertEquals(expectedMessage, failure.getMessage());
    dir.close();
}
Also used : CorruptingIndexOutput(org.apache.lucene.store.CorruptingIndexOutput) IndexOutput(org.apache.lucene.store.IndexOutput) ByteSequencesWriter(org.apache.lucene.util.OfflineSorter.ByteSequencesWriter) FilterDirectory(org.apache.lucene.store.FilterDirectory) Directory(org.apache.lucene.store.Directory)

Example 4 with ByteSequencesWriter

use of org.apache.lucene.util.OfflineSorter.ByteSequencesWriter in project lucene-solr by apache.

the class TestOfflineSorter method testFixedLengthHeap.

public void testFixedLengthHeap() throws Exception {
    // Make sure the RAM accounting is correct, i.e. if we are sorting fixed width
    // ints (4 bytes) then the heap used is really only 4 bytes per value:
    Directory dir = newDirectory();
    IndexOutput out = dir.createTempOutput("unsorted", "tmp", IOContext.DEFAULT);
    try (ByteSequencesWriter w = new OfflineSorter.ByteSequencesWriter(out)) {
        byte[] bytes = new byte[Integer.BYTES];
        for (int i = 0; i < 1024 * 1024; i++) {
            random().nextBytes(bytes);
            w.write(bytes);
        }
        CodecUtil.writeFooter(out);
    }
    // may be null, in which case no executor is handed to the sorter
    ExecutorService exec = randomExecutorServiceOrNull();
    OfflineSorter sorter;
    try {
        sorter = new OfflineSorter(dir, "foo", OfflineSorter.DEFAULT_COMPARATOR, BufferSize.megabytes(4), OfflineSorter.MAX_TEMPFILES, Integer.BYTES, exec, TestUtil.nextInt(random(), 1, 4));
        sorter.sort(out.getName());
    } finally {
        // shut the executor down even when sorting fails, so its threads are not leaked
        if (exec != null) {
            exec.shutdownNow();
        }
    }
    // 1M ints with 4 MB heap allowed should have been sorted in a single heap partition:
    assertEquals(0, sorter.sortInfo.mergeRounds);
    dir.close();
}
Also used : ExecutorService(java.util.concurrent.ExecutorService) CorruptingIndexOutput(org.apache.lucene.store.CorruptingIndexOutput) IndexOutput(org.apache.lucene.store.IndexOutput) ByteSequencesWriter(org.apache.lucene.util.OfflineSorter.ByteSequencesWriter) FilterDirectory(org.apache.lucene.store.FilterDirectory) Directory(org.apache.lucene.store.Directory)

Aggregations

IndexOutput (org.apache.lucene.store.IndexOutput): 4, ByteSequencesWriter (org.apache.lucene.util.OfflineSorter.ByteSequencesWriter): 4, CorruptingIndexOutput (org.apache.lucene.store.CorruptingIndexOutput): 3, Directory (org.apache.lucene.store.Directory): 3, FilterDirectory (org.apache.lucene.store.FilterDirectory): 3, BufferedInputStream (java.io.BufferedInputStream): 1, BufferedReader (java.io.BufferedReader): 1, IOException (java.io.IOException): 1, InputStream (java.io.InputStream): 1, InputStreamReader (java.io.InputStreamReader): 1, ExecutorService (java.util.concurrent.ExecutorService): 1, ChecksumIndexInput (org.apache.lucene.store.ChecksumIndexInput): 1, BytesRef (org.apache.lucene.util.BytesRef): 1, BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder): 1, IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder): 1, OfflineSorter (org.apache.lucene.util.OfflineSorter): 1, ByteSequencesReader (org.apache.lucene.util.OfflineSorter.ByteSequencesReader): 1