Search in sources :

Example 26 with IndexOutput

use of org.apache.lucene.store.IndexOutput in project lucene-solr by apache.

the class TestOfflineSorter method testOverNexting.

/**
 * Checks that OfflineSorter stops calling ByteSequencesReader.next() once that reader has
 * returned null: a second call after exhaustion trips an assertion in the wrapping reader.
 */
public void testOverNexting() throws Exception {
    Directory dir = newDirectory();
    IndexOutput unsorted = dir.createTempOutput("unsorted", "tmp", IOContext.DEFAULT);
    try (ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(unsorted)) {
        // One record of Integer.BYTES random bytes, followed by the codec footer.
        byte[] record = new byte[Integer.BYTES];
        random().nextBytes(record);
        writer.write(record);
        CodecUtil.writeFooter(unsorted);
    }
    OfflineSorter sorter = new OfflineSorter(dir, "foo", OfflineSorter.DEFAULT_COMPARATOR, BufferSize.megabytes(4), OfflineSorter.MAX_TEMPFILES, Integer.BYTES, null, 0) {

        @Override
        protected ByteSequencesReader getReader(ChecksumIndexInput in, String name) throws IOException {
            final ByteSequencesReader delegate = super.getReader(in, name);
            // Wrap the real reader so that every call to next() can be policed.
            return new ByteSequencesReader(in, name) {

                /** Becomes true as soon as the delegate has reported end-of-data. */
                private boolean exhausted;

                @Override
                public BytesRef next() throws IOException {
                    // Being asked for more after we already answered null is the failure under test.
                    assertFalse(exhausted);
                    BytesRef current = delegate.next();
                    exhausted = (current == null);
                    return current;
                }

                @Override
                public void close() throws IOException {
                    delegate.close();
                }
            };
        }
    };
    sorter.sort(unsorted.getName());
    dir.close();
}
Also used : ChecksumIndexInput(org.apache.lucene.store.ChecksumIndexInput) CorruptingIndexOutput(org.apache.lucene.store.CorruptingIndexOutput) IndexOutput(org.apache.lucene.store.IndexOutput) IOException(java.io.IOException) ByteSequencesWriter(org.apache.lucene.util.OfflineSorter.ByteSequencesWriter) FilterDirectory(org.apache.lucene.store.FilterDirectory) Directory(org.apache.lucene.store.Directory)

Example 27 with IndexOutput

use of org.apache.lucene.store.IndexOutput in project lucene-solr by apache.

the class AnalyzingSuggester method build.

/**
 * Builds the suggester FST from the given input.
 *
 * <p>Works in two phases. First, every finite string of every surface form's analyzed automaton
 * is written to a temporary file as
 * {@code analyzedLength(short) + analyzed bytes + weight(int) [+ surfaceLength(short)] + surface [+ payload]}.
 * That file is then sorted offline, read back in sorted order, de-duplicated per analyzed form,
 * and fed to the FST builder. Temporary files are deleted on exit, whether or not the build
 * succeeded.
 *
 * @param iterator source of surface forms, weights and (optionally) payloads
 * @throws IllegalArgumentException if the iterator has contexts, if an analyzed form is longer
 *         than {@code Short.MAX_VALUE - 2}, or (with payloads) if a surface form is longer than
 *         {@code Short.MAX_VALUE - 2} or contains the reserved {@code PAYLOAD_SEP} byte
 * @throws IOException if writing, sorting or reading the temporary files fails
 */
@Override
public void build(InputIterator iterator) throws IOException {
    if (iterator.hasContexts()) {
        throw new IllegalArgumentException("this suggester doesn't support contexts");
    }
    hasPayloads = iterator.hasPayloads();
    OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix, new AnalyzingComparator(hasPayloads));
    IndexOutput tempInput = tempDir.createTempOutput(tempFileNamePrefix, "input", IOContext.DEFAULT);
    OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    OfflineSorter.ByteSequencesReader reader = null;
    BytesRefBuilder scratch = new BytesRefBuilder();
    TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
    String tempSortedFileName = null;
    count = 0;
    byte[] buffer = new byte[8];
    try {
        ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
        for (BytesRef surfaceForm; (surfaceForm = iterator.next()) != null; ) {
            LimitedFiniteStringsIterator finiteStrings = new LimitedFiniteStringsIterator(toAutomaton(surfaceForm, ts2a), maxGraphExpansions);
            for (IntsRef string; (string = finiteStrings.next()) != null; count++) {
                Util.toBytesRef(string, scratch);
                // length of the analyzed text (FST input)
                if (scratch.length() > Short.MAX_VALUE - 2) {
                    throw new IllegalArgumentException("cannot handle analyzed forms > " + (Short.MAX_VALUE - 2) + " in length (got " + scratch.length() + ")");
                }
                short analyzedLength = (short) scratch.length();
                // compute the required length:
                // analyzed sequence + weight (4) + surface + analyzedLength (short)
                int requiredLength = analyzedLength + 4 + surfaceForm.length + 2;
                BytesRef payload;
                if (hasPayloads) {
                    if (surfaceForm.length > (Short.MAX_VALUE - 2)) {
                        throw new IllegalArgumentException("cannot handle surface form > " + (Short.MAX_VALUE - 2) + " in length (got " + surfaceForm.length + ")");
                    }
                    payload = iterator.payload();
                    // payload + surfaceLength (short)
                    requiredLength += payload.length + 2;
                } else {
                    payload = null;
                }
                buffer = ArrayUtil.grow(buffer, requiredLength);
                output.reset(buffer);
                output.writeShort(analyzedLength);
                output.writeBytes(scratch.bytes(), 0, scratch.length());
                output.writeInt(encodeWeight(iterator.weight()));
                if (hasPayloads) {
                    // The valid bytes of a BytesRef start at its offset, not at index 0, so the
                    // scan for the reserved separator must be relative to surfaceForm.offset.
                    for (int i = 0; i < surfaceForm.length; i++) {
                        if (surfaceForm.bytes[surfaceForm.offset + i] == PAYLOAD_SEP) {
                            throw new IllegalArgumentException("surface form cannot contain unit separator character U+001F; this character is reserved");
                        }
                    }
                    output.writeShort((short) surfaceForm.length);
                    output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
                    output.writeBytes(payload.bytes, payload.offset, payload.length);
                } else {
                    output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
                }
                assert output.getPosition() == requiredLength : output.getPosition() + " vs " + requiredLength;
                writer.write(buffer, 0, output.getPosition());
            }
            maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, finiteStrings.size());
        }
        CodecUtil.writeFooter(tempInput);
        writer.close();
        // Sort all input/output pairs (required by FST.Builder):
        tempSortedFileName = sorter.sort(tempInput.getName());
        // Free disk space:
        tempDir.deleteFile(tempInput.getName());
        reader = new OfflineSorter.ByteSequencesReader(tempDir.openChecksumInput(tempSortedFileName, IOContext.READONCE), tempSortedFileName);
        PairOutputs<Long, BytesRef> outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
        Builder<Pair<Long, BytesRef>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
        // Build FST:
        BytesRefBuilder previousAnalyzed = null;
        BytesRefBuilder analyzed = new BytesRefBuilder();
        BytesRef surface = new BytesRef();
        IntsRefBuilder scratchInts = new IntsRefBuilder();
        ByteArrayDataInput input = new ByteArrayDataInput();
        // Used to remove duplicate surface forms (but we
        // still index the highest-weight one).  We clear
        // this when we see a new analyzed form, so it cannot
        // grow unbounded (at most 256 entries):
        Set<BytesRef> seenSurfaceForms = new HashSet<>();
        int dedup = 0;
        while (true) {
            BytesRef bytes = reader.next();
            if (bytes == null) {
                break;
            }
            input.reset(bytes.bytes, bytes.offset, bytes.length);
            // input positions are absolute indexes into bytes.bytes; the record ends here:
            final int recordEnd = bytes.offset + bytes.length;
            short analyzedLength = input.readShort();
            analyzed.grow(analyzedLength + 2);
            input.readBytes(analyzed.bytes(), 0, analyzedLength);
            analyzed.setLength(analyzedLength);
            long cost = input.readInt();
            // surface is a view into the record's backing array, not a copy
            surface.bytes = bytes.bytes;
            if (hasPayloads) {
                surface.length = input.readShort();
                surface.offset = input.getPosition();
            } else {
                surface.offset = input.getPosition();
                // everything up to the end of the record is the surface form
                surface.length = recordEnd - surface.offset;
            }
            if (previousAnalyzed == null) {
                previousAnalyzed = new BytesRefBuilder();
                previousAnalyzed.copyBytes(analyzed.get());
                seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
            } else if (analyzed.get().equals(previousAnalyzed.get())) {
                dedup++;
                if (dedup >= maxSurfaceFormsPerAnalyzedForm) {
                    // dups: skip the rest:
                    continue;
                }
                if (seenSurfaceForms.contains(surface)) {
                    continue;
                }
                seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
            } else {
                dedup = 0;
                previousAnalyzed.copyBytes(analyzed);
                seenSurfaceForms.clear();
                seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
            }
            // TODO: I think we can avoid the extra 2 bytes when
            // there is no dup (dedup==0), but we'd have to fix
            // the exactFirst logic ... which would be sort of
            // hairy because we'd need to special case the two
            // (dup/not dup)...
            // NOTE: must be byte 0 so we sort before whatever
            // is next
            analyzed.append((byte) 0);
            analyzed.append((byte) dedup);
            Util.toIntsRef(analyzed.get(), scratchInts);
            //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
            if (!hasPayloads) {
                builder.add(scratchInts.get(), outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
            } else {
                // the payload is whatever follows the surface form, up to the end of the record
                int payloadOffset = input.getPosition() + surface.length;
                int payloadLength = recordEnd - payloadOffset;
                // FST output is: surface + PAYLOAD_SEP + payload
                BytesRef br = new BytesRef(surface.length + 1 + payloadLength);
                System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length);
                br.bytes[surface.length] = PAYLOAD_SEP;
                System.arraycopy(bytes.bytes, payloadOffset, br.bytes, surface.length + 1, payloadLength);
                br.length = br.bytes.length;
                builder.add(scratchInts.get(), outputs.newPair(cost, br));
            }
        }
        fst = builder.finish();
    //Util.dotToFile(fst, "/tmp/suggest.dot");
    } finally {
        // writer may already be closed on the success path; closing again here covers failures
        IOUtils.closeWhileHandlingException(reader, writer);
        IOUtils.deleteFilesIgnoringExceptions(tempDir, tempInput.getName(), tempSortedFileName);
    }
}
Also used : OfflineSorter(org.apache.lucene.util.OfflineSorter) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) Builder(org.apache.lucene.util.fst.Builder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) PairOutputs(org.apache.lucene.util.fst.PairOutputs) IntsRef(org.apache.lucene.util.IntsRef) BytesRef(org.apache.lucene.util.BytesRef) Pair(org.apache.lucene.util.fst.PairOutputs.Pair) HashSet(java.util.HashSet) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) IndexOutput(org.apache.lucene.store.IndexOutput) LimitedFiniteStringsIterator(org.apache.lucene.util.automaton.LimitedFiniteStringsIterator) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) ByteArrayDataInput(org.apache.lucene.store.ByteArrayDataInput) ByteArrayDataOutput(org.apache.lucene.store.ByteArrayDataOutput) TokenStreamToAutomaton(org.apache.lucene.analysis.TokenStreamToAutomaton)

Example 28 with IndexOutput

use of org.apache.lucene.store.IndexOutput in project lucene-solr by apache.

the class IndexFetcher method logReplicationTimeAndConfFiles.

/**
 * Records the details of the most recent replication in the replication properties file so that
 * they can still be shown on the statistics page after a restart.
 *
 * <p>The updated properties are written to a uniquely named temporary file, which is then renamed
 * over {@code REPLICATION_PROPERTIES}. Any exception raised while updating or writing is logged
 * as a warning and not rethrown.
 *
 * @param modifiedConfFiles details of the configuration files that were replicated; may be
 *        {@code null} or empty, in which case no configuration-file properties are touched
 * @param successfulInstall {@code false} to additionally record this cycle as a failure
 * @throws IOException on IO error
 */
@SuppressForbidden(reason = "Need currentTimeMillis for debugging/stats")
private void logReplicationTimeAndConfFiles(Collection<Map<String, Object>> modifiedConfFiles, boolean successfulInstall) throws IOException {
    final boolean confFilesChanged = modifiedConfFiles != null && !modifiedConfFiles.isEmpty();
    final List<String> confFileNames = new ArrayList<>();
    if (confFilesChanged) {
        for (Map<String, Object> fileDetails : modifiedConfFiles) {
            confFileNames.add((String) fileDetails.get(NAME));
        }
    }
    final Properties props = replicationHandler.loadReplicationProperties();
    final long now = System.currentTimeMillis();
    final long timeTaken = getReplicationTimeElapsed();
    final String nowText = String.valueOf(now);
    Directory dir = null;
    try {
        dir = solrCore.getDirectoryFactory().get(solrCore.getDataDir(), DirContext.META_DATA, solrCore.getSolrConfig().indexConfig.lockType);

        // Index replication bookkeeping: the counter starts at 1 when no previous value is stored.
        final int timesIndexReplicated = props.containsKey(TIMES_INDEX_REPLICATED)
            ? Integer.parseInt(props.getProperty(TIMES_INDEX_REPLICATED)) + 1
            : 1;
        final StringBuilder replicatedAtList = readToStringBuilder(now, props.getProperty(INDEX_REPLICATED_AT_LIST));
        props.setProperty(INDEX_REPLICATED_AT_LIST, replicatedAtList.toString());
        props.setProperty(INDEX_REPLICATED_AT, nowText);
        props.setProperty(PREVIOUS_CYCLE_TIME_TAKEN, String.valueOf(timeTaken));
        props.setProperty(TIMES_INDEX_REPLICATED, String.valueOf(timesIndexReplicated));

        // Configuration file bookkeeping, only when some were actually replicated.
        if (confFilesChanged) {
            props.setProperty(CONF_FILES_REPLICATED, confFileNames.toString());
            props.setProperty(CONF_FILES_REPLICATED_AT, nowText);
            final int timesConfigReplicated = props.containsKey(TIMES_CONFIG_REPLICATED)
                ? Integer.parseInt(props.getProperty(TIMES_CONFIG_REPLICATED)) + 1
                : 1;
            props.setProperty(TIMES_CONFIG_REPLICATED, String.valueOf(timesConfigReplicated));
        }

        props.setProperty(LAST_CYCLE_BYTES_DOWNLOADED, String.valueOf(getTotalBytesDownloaded()));

        // Failure bookkeeping.
        if (!successfulInstall) {
            final int timesFailed = props.containsKey(TIMES_FAILED)
                ? Integer.parseInt(props.getProperty(TIMES_FAILED)) + 1
                : 1;
            props.setProperty(TIMES_FAILED, String.valueOf(timesFailed));
            props.setProperty(REPLICATION_FAILED_AT, nowText);
            final StringBuilder failedAtList = readToStringBuilder(now, props.getProperty(REPLICATION_FAILED_AT_LIST));
            props.setProperty(REPLICATION_FAILED_AT_LIST, failedAtList.toString());
        }

        // Write to a temp file first, then rename it over the real properties file.
        final String tmpFileName = REPLICATION_PROPERTIES + "." + System.nanoTime();
        final IndexOutput out = dir.createOutput(tmpFileName, DirectoryFactory.IOCONTEXT_NO_CACHE);
        final Writer propsWriter = new OutputStreamWriter(new PropertiesOutputStream(out), StandardCharsets.UTF_8);
        try {
            props.store(propsWriter, "Replication details");
            dir.sync(Collections.singleton(tmpFileName));
        } finally {
            IOUtils.closeQuietly(propsWriter);
        }
        solrCore.getDirectoryFactory().renameWithOverwrite(dir, tmpFileName, REPLICATION_PROPERTIES);
    } catch (Exception e) {
        LOG.warn("Exception while updating statistics", e);
    } finally {
        if (dir != null) {
            solrCore.getDirectoryFactory().release(dir);
        }
    }
}
Also used : ArrayList(java.util.ArrayList) IndexOutput(org.apache.lucene.store.IndexOutput) Properties(java.util.Properties) NoSuchFileException(java.nio.file.NoSuchFileException) SolrServerException(org.apache.solr.client.solrj.SolrServerException) SolrException(org.apache.solr.common.SolrException) FileNotFoundException(java.io.FileNotFoundException) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) OutputStreamWriter(java.io.OutputStreamWriter) Map(java.util.Map) HashMap(java.util.HashMap) IndexWriter(org.apache.lucene.index.IndexWriter) Writer(java.io.Writer) OutputStreamWriter(java.io.OutputStreamWriter) Directory(org.apache.lucene.store.Directory) PropertiesOutputStream(org.apache.solr.util.PropertiesOutputStream) SuppressForbidden(org.apache.solr.common.util.SuppressForbidden)

Example 29 with IndexOutput

use of org.apache.lucene.store.IndexOutput in project lucene-solr by apache.

the class TestPackedInts method testEndPointer.

/**
 * Checks that, for every supported format / version / bits-per-value combination, each of the
 * header-less PackedInts readers leaves the input's file pointer exactly at the byte count the
 * format reports for the data.
 */
public void testEndPointer() throws IOException {
    final Directory dir = newDirectory();
    final int valueCount = RandomNumbers.randomIntBetween(random(), 1, 1000);

    // Back the readers with valueCount zero longs.
    final IndexOutput out = dir.createOutput("tests.bin", newIOContext(random()));
    for (int written = 0; written < valueCount; written++) {
        out.writeLong(0L);
    }
    out.close();

    final IndexInput in = dir.openInput("tests.bin", newIOContext(random()));
    for (int version = PackedInts.VERSION_START; version <= PackedInts.VERSION_CURRENT; ++version) {
        for (int bpv = 1; bpv <= 64; ++bpv) {
            for (PackedInts.Format format : PackedInts.Format.values()) {
                if (!format.isSupported(bpv)) {
                    continue;
                }
                final long expectedEnd = format.byteCount(version, valueCount, bpv);
                final String msg = "format=" + format + ",version=" + version + ",valueCount=" + valueCount + ",bpv=" + bpv;

                // 1) iterator: consume every value, then check where the input ended up
                in.seek(0L);
                final int mem = RandomNumbers.randomIntBetween(random(), 1, 1 << 16);
                final PackedInts.ReaderIterator iterator = PackedInts.getReaderIteratorNoHeader(in, format, version, valueCount, bpv, mem);
                for (int consumed = 0; consumed < valueCount; consumed++) {
                    iterator.next();
                }
                assertEquals(msg, expectedEnd, in.getFilePointer());

                // 2) direct reader: read the last value
                in.seek(0L);
                final PackedInts.Reader direct = PackedInts.getDirectReaderNoHeader(in, format, version, valueCount, bpv);
                direct.get(valueCount - 1);
                assertEquals(msg, expectedEnd, in.getFilePointer());

                // 3) plain reader: only constructed, no value is read from it
                in.seek(0L);
                PackedInts.getReaderNoHeader(in, format, version, valueCount, bpv);
                assertEquals(msg, expectedEnd, in.getFilePointer());
            }
        }
    }
    in.close();
    dir.close();
}
Also used : Reader(org.apache.lucene.util.packed.PackedInts.Reader) IndexInput(org.apache.lucene.store.IndexInput) IndexOutput(org.apache.lucene.store.IndexOutput) RAMDirectory(org.apache.lucene.store.RAMDirectory) Directory(org.apache.lucene.store.Directory)

Example 30 with IndexOutput

use of org.apache.lucene.store.IndexOutput in project lucene-solr by apache.

the class TestPackedInts method testSave.

/**
 * Round-trips every mutable produced by createPackedInts through save / getReader, for each
 * bits-per-value from 1 to 64, and checks that the reader has the same size and values and is a
 * Packed64SingleBlock exactly when the saved mutable was one.
 */
public void testSave() throws IOException {
    final String fileName = "packed-ints.bin";
    final int valueCount = TestUtil.nextInt(random(), 1, 2048);
    for (int bpv = 1; bpv <= 64; ++bpv) {
        // Exclusive upper bound for the random fill values below.
        final int maxValue = (int) Math.min(PackedInts.maxValue(31), PackedInts.maxValue(bpv));
        final RAMDirectory directory = new RAMDirectory();
        final List<PackedInts.Mutable> candidates = createPackedInts(valueCount, bpv);
        for (PackedInts.Mutable mutable : candidates) {
            // Fill with random values.
            final int size = mutable.size();
            for (int index = 0; index < size; ++index) {
                mutable.set(index, random().nextInt(maxValue));
            }

            // Serialize.
            final IndexOutput out = directory.createOutput(fileName, IOContext.DEFAULT);
            mutable.save(out);
            out.close();

            // Read back and compare.
            final IndexInput in = directory.openInput(fileName, IOContext.DEFAULT);
            final PackedInts.Reader reader = PackedInts.getReader(in);
            assertEquals(valueCount, reader.size());

            // make sure that we used the right format so that the reader has
            // the same performance characteristics as the mutable that has been
            // serialized
            final boolean readerIsSingleBlock = reader instanceof Packed64SingleBlock;
            if (mutable instanceof Packed64SingleBlock) {
                assertTrue(readerIsSingleBlock);
            } else {
                assertFalse(readerIsSingleBlock);
            }

            for (int index = 0; index < valueCount; ++index) {
                assertEquals(mutable.get(index), reader.get(index));
            }
            in.close();
            directory.deleteFile(fileName);
        }
        directory.close();
    }
}
Also used : Reader(org.apache.lucene.util.packed.PackedInts.Reader) IndexInput(org.apache.lucene.store.IndexInput) IndexOutput(org.apache.lucene.store.IndexOutput) RAMDirectory(org.apache.lucene.store.RAMDirectory)

Aggregations

IndexOutput (org.apache.lucene.store.IndexOutput): 182; Directory (org.apache.lucene.store.Directory): 79; IndexInput (org.apache.lucene.store.IndexInput): 76; RAMDirectory (org.apache.lucene.store.RAMDirectory): 36; FilterDirectory (org.apache.lucene.store.FilterDirectory): 34; CorruptIndexException (org.apache.lucene.index.CorruptIndexException): 27; ChecksumIndexInput (org.apache.lucene.store.ChecksumIndexInput): 27; BytesRef (org.apache.lucene.util.BytesRef): 26; IOException (java.io.IOException): 20; CorruptingIndexOutput (org.apache.lucene.store.CorruptingIndexOutput): 18; RAMFile (org.apache.lucene.store.RAMFile): 16; RAMOutputStream (org.apache.lucene.store.RAMOutputStream): 16; IndexFormatTooNewException (org.apache.lucene.index.IndexFormatTooNewException): 14; IndexFormatTooOldException (org.apache.lucene.index.IndexFormatTooOldException): 14; IOContext (org.apache.lucene.store.IOContext): 13; ArrayList (java.util.ArrayList): 11; BufferedChecksumIndexInput (org.apache.lucene.store.BufferedChecksumIndexInput): 11; RAMInputStream (org.apache.lucene.store.RAMInputStream): 11; NIOFSDirectory (org.apache.lucene.store.NIOFSDirectory): 10; NRTCachingDirectory (org.apache.lucene.store.NRTCachingDirectory): 10