Example 31 with AtomicLong

Use of java.util.concurrent.atomic.AtomicLong in project crate by crate.

The class BlobRecoverySourceHandler, method phase1.

/**
     * Perform phase1 of the recovery operations. Once this {@link SnapshotIndexCommit}
     * snapshot has been performed no commit operations (files being fsync'd)
     * are effectively allowed on this index until all recovery phases are done
     * <p/>
     * Phase1 examines the segment files on the target node and copies over the
     * segments that are missing. Only segments that have the same size and
     * checksum can be reused
     */
public void phase1(final SnapshotIndexCommit snapshot, final Translog.View translogView) {
    cancellableThreads.checkForCancel();
    // Total size of segment files that are recovered
    long totalSize = 0;
    // Total size of segment files that were able to be re-used
    long existingTotalSize = 0;
    final Store store = shard.store();
    store.incRef();
    try {
        // CRATE CHANGE
        if (blobRecoveryHandler != null) {
            blobRecoveryHandler.phase1();
        }
        StopWatch stopWatch = new StopWatch().start();
        final Store.MetadataSnapshot recoverySourceMetadata;
        try {
            recoverySourceMetadata = store.getMetadata(snapshot);
        } catch (CorruptIndexException | IndexFormatTooOldException | IndexFormatTooNewException ex) {
            shard.engine().failEngine("recovery", ex);
            throw ex;
        }
        for (String name : snapshot.getFiles()) {
            final StoreFileMetaData md = recoverySourceMetadata.get(name);
            if (md == null) {
                logger.info("Snapshot differs from actual index for file: {} meta: {}", name, recoverySourceMetadata.asMap());
                throw new CorruptIndexException("Snapshot differs from actual index - maybe index was removed metadata has " + recoverySourceMetadata.asMap().size() + " files", name);
            }
        }
        // Generate a "diff" of all the identical, different, and missing
        // segment files on the target node, using the existing files on
        // the source node
        String recoverySourceSyncId = recoverySourceMetadata.getSyncId();
        String recoveryTargetSyncId = request.metadataSnapshot().getSyncId();
        final boolean recoverWithSyncId = recoverySourceSyncId != null && recoverySourceSyncId.equals(recoveryTargetSyncId);
        if (recoverWithSyncId) {
            final long numDocsTarget = request.metadataSnapshot().getNumDocs();
            final long numDocsSource = recoverySourceMetadata.getNumDocs();
            if (numDocsTarget != numDocsSource) {
                throw new IllegalStateException("try to recover " + request.shardId() + " from primary shard with sync id but number of docs differ: " + numDocsTarget + " (" + request.sourceNode().getName() + ", primary) vs " + numDocsSource + "(" + request.targetNode().getName() + ")");
            }
            // we shortcut recovery here because we have nothing to copy. but we must still start the engine on the target.
            // so we don't return here
            logger.trace("[{}][{}] skipping [phase1] to {} - identical sync id [{}] found on both source and target", indexName, shardId, request.targetNode(), recoverySourceSyncId);
        } else {
            final Store.RecoveryDiff diff = recoverySourceMetadata.recoveryDiff(request.metadataSnapshot());
            for (StoreFileMetaData md : diff.identical) {
                response.phase1ExistingFileNames.add(md.name());
                response.phase1ExistingFileSizes.add(md.length());
                existingTotalSize += md.length();
                if (logger.isTraceEnabled()) {
                    logger.trace("[{}][{}] recovery [phase1] to {}: not recovering [{}], exists in local store and has checksum [{}], size [{}]", indexName, shardId, request.targetNode(), md.name(), md.checksum(), md.length());
                }
                totalSize += md.length();
            }
            for (StoreFileMetaData md : Iterables.concat(diff.different, diff.missing)) {
                if (request.metadataSnapshot().asMap().containsKey(md.name())) {
                    logger.trace("[{}][{}] recovery [phase1] to {}: recovering [{}], exists in local store, but is different: remote [{}], local [{}]", indexName, shardId, request.targetNode(), md.name(), request.metadataSnapshot().asMap().get(md.name()), md);
                } else {
                    logger.trace("[{}][{}] recovery [phase1] to {}: recovering [{}], does not exists in remote", indexName, shardId, request.targetNode(), md.name());
                }
                response.phase1FileNames.add(md.name());
                response.phase1FileSizes.add(md.length());
                totalSize += md.length();
            }
            response.phase1TotalSize = totalSize;
            response.phase1ExistingTotalSize = existingTotalSize;
            logger.trace("[{}][{}] recovery [phase1] to {}: recovering_files [{}] with total_size [{}], reusing_files [{}] with total_size [{}]", indexName, shardId, request.targetNode(), response.phase1FileNames.size(), new ByteSizeValue(totalSize), response.phase1ExistingFileNames.size(), new ByteSizeValue(existingTotalSize));
            cancellableThreads.execute(new Interruptable() {

                @Override
                public void run() throws InterruptedException {
                    RecoveryFilesInfoRequest recoveryInfoFilesRequest = new RecoveryFilesInfoRequest(request.recoveryId(), request.shardId(), response.phase1FileNames, response.phase1FileSizes, response.phase1ExistingFileNames, response.phase1ExistingFileSizes, translogView.totalOperations());
                    transportService.submitRequest(request.targetNode(), RecoveryTarget.Actions.FILES_INFO, recoveryInfoFilesRequest, TransportRequestOptions.builder().withTimeout(recoverySettings.internalActionTimeout()).build(), EmptyTransportResponseHandler.INSTANCE_SAME).txGet();
                }
            });
            // This latch will be used to wait until all files have been transferred to the target node
            final CountDownLatch latch = new CountDownLatch(response.phase1FileNames.size());
            final CopyOnWriteArrayList<Throwable> exceptions = new CopyOnWriteArrayList<>();
            final AtomicReference<Throwable> corruptedEngine = new AtomicReference<>();
            int fileIndex = 0;
            ThreadPoolExecutor pool;
            // How many bytes we've copied since we last called RateLimiter.pause
            final AtomicLong bytesSinceLastPause = new AtomicLong();
            for (final String name : response.phase1FileNames) {
                long fileSize = response.phase1FileSizes.get(fileIndex);
                // Small files go to a dedicated pool so they are not queued behind large segment transfers
                if (fileSize > RecoverySettings.SMALL_FILE_CUTOFF_BYTES) {
                    pool = recoverySettings.concurrentStreamPool();
                } else {
                    pool = recoverySettings.concurrentSmallFileStreamPool();
                }
                pool.execute(new AbstractRunnable() {

                    @Override
                    public void onFailure(Throwable t) {
                        // we either got rejected or the store can't be incremented / we are canceled
                        logger.debug("Failed to transfer file [" + name + "] on recovery");
                    }

                    @Override
                    public void onAfter() {
                        // Signify this file has completed by decrementing the latch
                        latch.countDown();
                    }

                    @Override
                    protected void doRun() {
                        cancellableThreads.checkForCancel();
                        store.incRef();
                        final StoreFileMetaData md = recoverySourceMetadata.get(name);
                        try (final IndexInput indexInput = store.directory().openInput(name, IOContext.READONCE)) {
                            // read in chunks of fileChunkSize, but always at least one byte
                            final int BUFFER_SIZE = (int) Math.max(1, recoverySettings.fileChunkSize().getBytes());
                            final byte[] buf = new byte[BUFFER_SIZE];
                            boolean shouldCompressRequest = recoverySettings.compress();
                            if (CompressorFactory.isCompressed(indexInput)) {
                                shouldCompressRequest = false;
                            }
                            final long len = indexInput.length();
                            long readCount = 0;
                            final TransportRequestOptions requestOptions = TransportRequestOptions.builder().withCompress(shouldCompressRequest).withType(TransportRequestOptions.Type.RECOVERY).withTimeout(recoverySettings.internalActionTimeout()).build();
                            while (readCount < len) {
                                if (shard.state() == IndexShardState.CLOSED) {
                                    // check if the shard got closed on us
                                    throw new IndexShardClosedException(shard.shardId());
                                }
                                int toRead = readCount + BUFFER_SIZE > len ? (int) (len - readCount) : BUFFER_SIZE;
                                final long position = indexInput.getFilePointer();
                                // Pause using the rate limiter, if desired, to throttle the recovery
                                RateLimiter rl = recoverySettings.rateLimiter();
                                long throttleTimeInNanos = 0;
                                if (rl != null) {
                                    long bytes = bytesSinceLastPause.addAndGet(toRead);
                                    if (bytes > rl.getMinPauseCheckBytes()) {
                                        // Time to pause
                                        bytesSinceLastPause.addAndGet(-bytes);
                                        throttleTimeInNanos = rl.pause(bytes);
                                        shard.recoveryStats().addThrottleTime(throttleTimeInNanos);
                                    }
                                }
                                indexInput.readBytes(buf, 0, toRead, false);
                                final BytesArray content = new BytesArray(buf, 0, toRead);
                                readCount += toRead;
                                final boolean lastChunk = readCount == len;
                                final RecoveryFileChunkRequest fileChunkRequest = new RecoveryFileChunkRequest(request.recoveryId(), request.shardId(), md, position, content, lastChunk, translogView.totalOperations(), throttleTimeInNanos);
                                cancellableThreads.execute(new Interruptable() {

                                    @Override
                                    public void run() throws InterruptedException {
                                        // Actually send the file chunk to the target node, waiting for it to complete
                                        transportService.submitRequest(request.targetNode(), RecoveryTarget.Actions.FILE_CHUNK, fileChunkRequest, requestOptions, EmptyTransportResponseHandler.INSTANCE_SAME).txGet();
                                    }
                                });
                            }
                        } catch (Throwable e) {
                            final Throwable corruptIndexException;
                            if ((corruptIndexException = ExceptionsHelper.unwrapCorruption(e)) != null) {
                                if (store.checkIntegrityNoException(md) == false) {
                                    // we are corrupted on the primary -- fail!
                                    logger.warn("{} Corrupted file detected {} checksum mismatch", shard.shardId(), md);
                                    if (corruptedEngine.compareAndSet(null, corruptIndexException) == false) {
                                        // if we are not the first exception, add ourselves as suppressed to the main one:
                                        corruptedEngine.get().addSuppressed(e);
                                    }
                                } else {
                                    // corruption has happened on the way to replica
                                    RemoteTransportException exception = new RemoteTransportException("File corruption occurred on recovery but checksums are ok", null);
                                    exception.addSuppressed(e);
                                    // last exception first
                                    exceptions.add(0, exception);
                                    logger.warn("{} Remote file corruption on node {}, recovering {}. local checksum OK", corruptIndexException, shard.shardId(), request.targetNode(), md);
                                }
                            } else {
                                // last exceptions first
                                exceptions.add(0, e);
                            }
                        } finally {
                            store.decRef();
                        }
                    }
                });
                fileIndex++;
            }
            cancellableThreads.execute(new Interruptable() {

                @Override
                public void run() throws InterruptedException {
                    // Wait for all files that need to be transferred to finish transferring
                    latch.await();
                }
            });
            if (corruptedEngine.get() != null) {
                shard.engine().failEngine("recovery", corruptedEngine.get());
                throw corruptedEngine.get();
            } else {
                ExceptionsHelper.rethrowAndSuppress(exceptions);
            }
            cancellableThreads.execute(new Interruptable() {

                @Override
                public void run() throws InterruptedException {
                    // ask the target to clean up: transferred temporary files are moved into place and files not belonging to this recovery are deleted
                    try {
                        transportService.submitRequest(request.targetNode(), RecoveryTarget.Actions.CLEAN_FILES, new RecoveryCleanFilesRequest(request.recoveryId(), shard.shardId(), recoverySourceMetadata, translogView.totalOperations()), TransportRequestOptions.builder().withTimeout(recoverySettings.internalActionTimeout()).build(), EmptyTransportResponseHandler.INSTANCE_SAME).txGet();
                    } catch (RemoteTransportException remoteException) {
                        final IOException corruptIndexException;
                        // the target reported corruption while finalizing - maybe due to old segments without checksums or length-only checks; verify the local copy before treating it as remote corruption
                        if ((corruptIndexException = ExceptionsHelper.unwrapCorruption(remoteException)) != null) {
                            try {
                                final Store.MetadataSnapshot recoverySourceMetadata = store.getMetadata(snapshot);
                                StoreFileMetaData[] metadata = Iterables.toArray(recoverySourceMetadata, StoreFileMetaData.class);
                                ArrayUtil.timSort(metadata, new Comparator<StoreFileMetaData>() {

                                    @Override
                                    public int compare(StoreFileMetaData o1, StoreFileMetaData o2) {
                                        // check small files first
                                        return Long.compare(o1.length(), o2.length());
                                    }
                                });
                                for (StoreFileMetaData md : metadata) {
                                    logger.debug("{} checking integrity for file {} after remove corruption exception", shard.shardId(), md);
                                    if (store.checkIntegrityNoException(md) == false) {
                                        // we are corrupted on the primary -- fail!
                                        shard.engine().failEngine("recovery", corruptIndexException);
                                        logger.warn("{} Corrupted file detected {} checksum mismatch", shard.shardId(), md);
                                        throw corruptIndexException;
                                    }
                                }
                            } catch (IOException ex) {
                                remoteException.addSuppressed(ex);
                                throw remoteException;
                            }
                            // corruption has happened on the way to replica
                            RemoteTransportException exception = new RemoteTransportException("File corruption occurred on recovery but checksums are ok", null);
                            exception.addSuppressed(remoteException);
                            logger.warn("{} Remote file corruption during finalization on node {}, recovering {}. local checksum OK", corruptIndexException, shard.shardId(), request.targetNode());
                            throw exception;
                        } else {
                            throw remoteException;
                        }
                    }
                }
            });
        }
        prepareTargetForTranslog(translogView);
        logger.trace("[{}][{}] recovery [phase1] to {}: took [{}]", indexName, shardId, request.targetNode(), stopWatch.totalTime());
        response.phase1Time = stopWatch.totalTime().millis();
    } catch (Throwable e) {
        throw new RecoverFilesRecoveryException(request.shardId(), response.phase1FileNames.size(), new ByteSizeValue(totalSize), e);
    } finally {
        store.decRef();
    }
}
Also used : AbstractRunnable(org.elasticsearch.common.util.concurrent.AbstractRunnable) ByteSizeValue(org.elasticsearch.common.unit.ByteSizeValue) Store(org.elasticsearch.index.store.Store) IndexFormatTooOldException(org.apache.lucene.index.IndexFormatTooOldException) StoreFileMetaData(org.elasticsearch.index.store.StoreFileMetaData) IndexInput(org.apache.lucene.store.IndexInput) TransportRequestOptions(org.elasticsearch.transport.TransportRequestOptions) RemoteTransportException(org.elasticsearch.transport.RemoteTransportException) BytesArray(org.elasticsearch.common.bytes.BytesArray) CorruptIndexException(org.apache.lucene.index.CorruptIndexException) Interruptable(org.elasticsearch.common.util.CancellableThreads.Interruptable) AtomicReference(java.util.concurrent.atomic.AtomicReference) IOException(java.io.IOException) CountDownLatch(java.util.concurrent.CountDownLatch) RateLimiter(org.apache.lucene.store.RateLimiter) StopWatch(org.elasticsearch.common.StopWatch) AtomicLong(java.util.concurrent.atomic.AtomicLong) IndexShardClosedException(org.elasticsearch.index.shard.IndexShardClosedException) IndexFormatTooNewException(org.apache.lucene.index.IndexFormatTooNewException) ThreadPoolExecutor(java.util.concurrent.ThreadPoolExecutor) CopyOnWriteArrayList(java.util.concurrent.CopyOnWriteArrayList)
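
The AtomicLong worth noting in this example is bytesSinceLastPause, shared by all file-copy tasks: each task adds the bytes it just read, and the task that pushes the running total over the rate limiter's minimum pause threshold subtracts the whole amount and pauses on behalf of the others. Below is a minimal, self-contained sketch of that idiom; the class name, the 512 KB threshold, and the nap() helper are illustrative assumptions, not part of the crate sources.

import java.util.concurrent.atomic.AtomicLong;

/**
 * Minimal sketch (not the crate/Elasticsearch code) of the throttling idiom above:
 * several copy threads share one AtomicLong that accumulates the bytes sent since
 * the last pause, and whichever thread crosses the threshold "claims" the
 * accumulated bytes and pays the pause for all of them.
 */
public class SharedThrottle {

    private static final long MIN_PAUSE_CHECK_BYTES = 512 * 1024; // assumed threshold
    private final AtomicLong bytesSinceLastPause = new AtomicLong();

    /** Called by every copy thread after it has read {@code bytesJustCopied} bytes. */
    public void maybePause(long bytesJustCopied) throws InterruptedException {
        long accumulated = bytesSinceLastPause.addAndGet(bytesJustCopied);
        if (accumulated > MIN_PAUSE_CHECK_BYTES) {
            // Subtract what we are about to account for; other threads keep adding concurrently.
            bytesSinceLastPause.addAndGet(-accumulated);
            nap(accumulated);
        }
    }

    /** Stand-in for RateLimiter.pause(bytes): sleep proportionally to the bytes sent. */
    private void nap(long bytes) throws InterruptedException {
        long millis = bytes / (10 * 1024); // pretend target rate of roughly 10 MB/s
        Thread.sleep(millis);
    }
}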

Example 32 with AtomicLong

Use of java.util.concurrent.atomic.AtomicLong in project crate by crate.

The class DigestBlob, method resumeTransfer.

public static DigestBlob resumeTransfer(BlobContainer blobContainer, String digest, UUID transferId, long currentPos) {
    DigestBlob digestBlob = new DigestBlob(blobContainer, digest, transferId);
    digestBlob.file = getTmpFilePath(blobContainer, digest, transferId).toFile();
    try {
        logger.trace("Resuming DigestBlob {}. CurrentPos {}", digest, currentPos);
        digestBlob.headFileChannel = new FileOutputStream(digestBlob.file, false).getChannel();
        digestBlob.headLength = currentPos;
        digestBlob.headSize = new AtomicLong();
        digestBlob.headCatchedUpLatch = new CountDownLatch(1);
        RandomAccessFile raf = new RandomAccessFile(digestBlob.file, "rw");
        raf.setLength(currentPos);
        raf.close();
        FileOutputStream outputStream = new FileOutputStream(digestBlob.file, true);
        digestBlob.fileChannel = outputStream.getChannel();
    } catch (IOException ex) {
        logger.error("error resuming transfer of {}, id: {}", ex, digest, transferId);
        return null;
    }
    return digestBlob;
}
Also used : AtomicLong(java.util.concurrent.atomic.AtomicLong) CountDownLatch(java.util.concurrent.CountDownLatch)
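
Here the AtomicLong (headSize) and the CountDownLatch (headCatchedUpLatch) together track a resumed transfer: the missing head of the blob is re-sent while new chunks keep arriving, and the latch opens once the head has caught up with the already-known length. A stripped-down sketch of that bookkeeping, with assumed names and without the file channels:

import java.util.concurrent.CountDownLatch;
import java.util.concurrent.atomic.AtomicLong;

/**
 * Sketch of the resume bookkeeping above (names and flow are assumptions, not the
 * DigestBlob implementation): while the missing "head" of a partially transferred
 * blob is re-sent, an AtomicLong counts the head bytes received and a CountDownLatch
 * releases whoever must wait until the head has caught up.
 */
class ResumableTransfer {

    private final long headLength;                        // bytes that still need to be re-sent
    private final AtomicLong headSize = new AtomicLong(); // head bytes received so far
    private final CountDownLatch headCaughtUp = new CountDownLatch(1);

    ResumableTransfer(long headLength) {
        this.headLength = headLength;
    }

    /** Called for every chunk of the re-sent head. */
    void addHeadChunk(int chunkLength) {
        if (headSize.addAndGet(chunkLength) >= headLength) {
            headCaughtUp.countDown(); // head complete, unblock waiters
        }
    }

    /** Called before finalizing the blob: wait until the head has been fully re-sent. */
    void awaitHead() throws InterruptedException {
        headCaughtUp.await();
    }
}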

Example 33 with AtomicLong

Use of java.util.concurrent.atomic.AtomicLong in project crate by crate.

The class RecoveryTests, method testPrimaryRelocationWhileIndexing.

@Test
public void testPrimaryRelocationWhileIndexing() throws Exception {
    final int numberOfRelocations = 1;
    final int numberOfWriters = 2;
    final String node1 = internalCluster().startNode();
    BlobAdminClient blobAdminClient = internalCluster().getInstance(BlobAdminClient.class, node1);
    logger.trace("--> creating test index ...");
    Settings indexSettings = Settings.builder().put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0).put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1).build();
    blobAdminClient.createBlobTable("test", indexSettings).get();
    logger.trace("--> starting [node2] ...");
    final String node2 = internalCluster().startNode();
    ensureGreen();
    final AtomicLong idGenerator = new AtomicLong();
    final AtomicLong indexCounter = new AtomicLong();
    final AtomicBoolean stop = new AtomicBoolean(false);
    Thread[] writers = new Thread[numberOfWriters];
    final CountDownLatch stopLatch = new CountDownLatch(writers.length);
    logger.trace("--> starting {} blob upload threads", writers.length);
    final List<String> uploadedDigests = Collections.synchronizedList(new ArrayList<String>(writers.length));
    for (int i = 0; i < writers.length; i++) {
        final int indexerId = i;
        writers[i] = new Thread() {

            @Override
            public void run() {
                try {
                    logger.trace("**** starting blob upload thread {}", indexerId);
                    while (!stop.get()) {
                        long id = idGenerator.incrementAndGet();
                        String digest = uploadFile(internalCluster().client(node1), genFile(id));
                        uploadedDigests.add(digest);
                        indexCounter.incrementAndGet();
                    }
                    logger.trace("**** done indexing thread {}", indexerId);
                } catch (Exception e) {
                    logger.warn("**** failed indexing thread {}", e, indexerId);
                } finally {
                    stopLatch.countDown();
                }
            }
        };
        writers[i].setName("blob-uploader-thread");
        // dispatch threads from parent, ignoring possible leaking threads
        writers[i].setDaemon(true);
        writers[i].start();
    }
    logger.trace("--> waiting for 2 blobs to be uploaded ...");
    while (uploadedDigests.size() < 2) {
        Thread.sleep(10);
    }
    logger.trace("--> 2 blobs uploaded");
    // increase time between chunks in order to make sure that the upload is taking place while relocating
    timeBetweenChunks.set(10);
    logger.trace("--> starting relocations...");
    for (int i = 0; i < numberOfRelocations; i++) {
        String fromNode = (i % 2 == 0) ? node1 : node2;
        String toNode = node1.equals(fromNode) ? node2 : node1;
        logger.trace("--> START relocate the shard from {} to {}", fromNode, toNode);
        internalCluster().client(node1).admin().cluster().prepareReroute().add(new MoveAllocationCommand(new ShardId(BlobIndex.fullIndexName("test"), 0), fromNode, toNode)).execute().actionGet();
        ClusterHealthResponse clusterHealthResponse = internalCluster().client(node1).admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).setWaitForRelocatingShards(0).setTimeout(ACCEPTABLE_RELOCATION_TIME).execute().actionGet();
        assertThat(clusterHealthResponse.isTimedOut(), equalTo(false));
        clusterHealthResponse = internalCluster().client(node2).admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).setWaitForRelocatingShards(0).setTimeout(ACCEPTABLE_RELOCATION_TIME).execute().actionGet();
        assertThat(clusterHealthResponse.isTimedOut(), equalTo(false));
        logger.trace("--> DONE relocate the shard from {} to {}", fromNode, toNode);
    }
    logger.trace("--> done relocations");
    logger.trace("--> marking and waiting for upload threads to stop ...");
    timeBetweenChunks.set(0);
    stop.set(true);
    assertThat(stopLatch.await(60, TimeUnit.SECONDS), is(true));
    logger.trace("--> uploading threads stopped");
    logger.trace("--> expected {} got {}", indexCounter.get(), uploadedDigests.size());
    assertEquals(indexCounter.get(), uploadedDigests.size());
    BlobIndicesService blobIndicesService = internalCluster().getInstance(BlobIndicesService.class, node2);
    for (String digest : uploadedDigests) {
        BlobShard blobShard = blobIndicesService.localBlobShard(BlobIndex.fullIndexName("test"), digest);
        long length = blobShard.blobContainer().getFile(digest).length();
        assertThat(length, greaterThanOrEqualTo(1L));
    }
    for (Thread writer : writers) {
        writer.join(6000);
    }
}
Also used : BlobIndicesService(io.crate.blob.v2.BlobIndicesService) BlobAdminClient(io.crate.blob.v2.BlobAdminClient) ClusterHealthResponse(org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse) MoveAllocationCommand(org.elasticsearch.cluster.routing.allocation.command.MoveAllocationCommand) CountDownLatch(java.util.concurrent.CountDownLatch) ShardId(org.elasticsearch.index.shard.ShardId) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) AtomicLong(java.util.concurrent.atomic.AtomicLong) BlobShard(io.crate.blob.v2.BlobShard) Settings(org.elasticsearch.common.settings.Settings) Test(org.junit.Test)
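
The test uses one AtomicLong as an id generator and another as an upload counter shared by all writer threads, plus an AtomicBoolean stop flag and a CountDownLatch to wait for the writers to exit. The essence of that pattern, with the blob upload faked, looks roughly like this (a sketch, not the crate test):

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;

/**
 * Stripped-down sketch of the writer-thread pattern in the test above: an AtomicLong
 * hands out unique ids, another counts completed uploads, an AtomicBoolean signals the
 * threads to stop, and a CountDownLatch lets the main thread wait for them.
 */
public class WriterThreadsSketch {

    public static void main(String[] args) throws Exception {
        final AtomicLong idGenerator = new AtomicLong();
        final AtomicLong uploadCounter = new AtomicLong();
        final AtomicBoolean stop = new AtomicBoolean(false);
        final List<String> uploaded = Collections.synchronizedList(new ArrayList<>());

        Thread[] writers = new Thread[2];
        final CountDownLatch stopLatch = new CountDownLatch(writers.length);
        for (int i = 0; i < writers.length; i++) {
            writers[i] = new Thread(() -> {
                try {
                    while (!stop.get()) {
                        long id = idGenerator.incrementAndGet(); // unique id per upload
                        uploaded.add("digest-" + id);            // stand-in for uploadFile(...)
                        uploadCounter.incrementAndGet();
                    }
                } finally {
                    stopLatch.countDown();
                }
            });
            writers[i].setDaemon(true);
            writers[i].start();
        }

        Thread.sleep(50); // let the writers produce some work
        stop.set(true);
        stopLatch.await(10, TimeUnit.SECONDS);
        // every generated id resulted in exactly one recorded upload
        System.out.println(uploadCounter.get() == uploaded.size());
    }
}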

Example 34 with AtomicLong

Use of java.util.concurrent.atomic.AtomicLong in project deeplearning4j by deeplearning4j.

The class SequenceVectors, method fit.

/**
     * Starts training over
     */
public void fit() {
    Properties props = Nd4j.getExecutioner().getEnvironmentInformation();
    if (props.getProperty("backend").equals("CUDA")) {
        if (Nd4j.getAffinityManager().getNumberOfDevices() > 1)
            throw new IllegalStateException("Multi-GPU word2vec/doc2vec isn't available atm");
    //if (!NativeOpsHolder.getInstance().getDeviceNativeOps().isP2PAvailable())
    //throw new IllegalStateException("Running Word2Vec on multi-gpu system requires P2P support between GPUs, which looks to be unavailable on your system.");
    }
    Nd4j.getRandom().setSeed(configuration.getSeed());
    AtomicLong timeSpent = new AtomicLong(0);
    if (!trainElementsVectors && !trainSequenceVectors)
        throw new IllegalStateException("You should define at least one training goal 'trainElementsRepresentation' or 'trainSequenceRepresentation'");
    if (iterator == null)
        throw new IllegalStateException("You can't fit() data without SequenceIterator defined");
    if (resetModel || (lookupTable != null && vocab != null && vocab.numWords() == 0)) {
        // build vocabulary from scratches
        buildVocab();
    }
    WordVectorSerializer.printOutProjectedMemoryUse(vocab.numWords(), configuration.getLayersSize(), configuration.isUseHierarchicSoftmax() && configuration.getNegative() > 0 ? 3 : 2);
    if (vocab == null || lookupTable == null || vocab.numWords() == 0)
        throw new IllegalStateException("You can't fit() model with empty Vocabulary or WeightLookupTable");
    // if model vocab and lookupTable is built externally we basically should check that lookupTable was properly initialized
    if (!resetModel || existingModel != null) {
        lookupTable.resetWeights(false);
    } else {
        // otherwise we reset weights, independent of actual current state of lookup table
        lookupTable.resetWeights(true);
        // if preciseWeights used, we roll over data once again
        if (configuration.isPreciseWeightInit()) {
            log.info("Using precise weights init...");
            iterator.reset();
            while (iterator.hasMoreSequences()) {
                Sequence<T> sequence = iterator.nextSequence();
                // initializing elements, only once
                for (T element : sequence.getElements()) {
                    T realElement = vocab.tokenFor(element.getLabel());
                    if (realElement != null && !realElement.isInit()) {
                        Random rng = Nd4j.getRandomFactory().getNewRandomInstance(configuration.getSeed() * realElement.hashCode(), configuration.getLayersSize() + 1);
                        INDArray randArray = Nd4j.rand(new int[] { 1, configuration.getLayersSize() }, rng).subi(0.5).divi(configuration.getLayersSize());
                        lookupTable.getWeights().getRow(realElement.getIndex()).assign(randArray);
                        realElement.setInit(true);
                    }
                }
                // initializing labels, only once
                for (T label : sequence.getSequenceLabels()) {
                    T realElement = vocab.tokenFor(label.getLabel());
                    if (realElement != null && !realElement.isInit()) {
                        Random rng = Nd4j.getRandomFactory().getNewRandomInstance(configuration.getSeed() * realElement.hashCode(), configuration.getLayersSize() + 1);
                        INDArray randArray = Nd4j.rand(new int[] { 1, configuration.getLayersSize() }, rng).subi(0.5).divi(configuration.getLayersSize());
                        lookupTable.getWeights().getRow(realElement.getIndex()).assign(randArray);
                        realElement.setInit(true);
                    }
                }
            }
            this.iterator.reset();
        }
    }
    initLearners();
    log.info("Starting learning process...");
    timeSpent.set(System.currentTimeMillis());
    if (this.stopWords == null)
        this.stopWords = new ArrayList<>();
    for (int currentEpoch = 1; currentEpoch <= numEpochs; currentEpoch++) {
        final AtomicLong linesCounter = new AtomicLong(0);
        final AtomicLong wordsCounter = new AtomicLong(0);
        AsyncSequencer sequencer = new AsyncSequencer(this.iterator, this.stopWords);
        sequencer.start();
        //final VectorCalculationsThread[] threads = new VectorCalculationsThread[workers];
        final AtomicLong timer = new AtomicLong(System.currentTimeMillis());
        final List<VectorCalculationsThread> threads = new ArrayList<>();
        for (int x = 0; x < workers; x++) {
            threads.add(x, new VectorCalculationsThread(x, currentEpoch, wordsCounter, vocab.totalWordOccurrences(), linesCounter, sequencer, timer));
            threads.get(x).start();
        }
        try {
            sequencer.join();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
        for (int x = 0; x < workers; x++) {
            try {
                threads.get(x).join();
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }
        // TODO: fix this to non-exclusive termination
        if (trainElementsVectors && elementsLearningAlgorithm != null && (!trainSequenceVectors || sequenceLearningAlgorithm == null) && elementsLearningAlgorithm.isEarlyTerminationHit()) {
            break;
        }
        if (trainSequenceVectors && sequenceLearningAlgorithm != null && (!trainElementsVectors || elementsLearningAlgorithm == null) && sequenceLearningAlgorithm.isEarlyTerminationHit()) {
            break;
        }
        log.info("Epoch: [" + currentEpoch + "]; Words vectorized so far: [" + wordsCounter.get() + "];  Lines vectorized so far: [" + linesCounter.get() + "]; learningRate: [" + minLearningRate + "]");
        if (eventListeners != null && !eventListeners.isEmpty()) {
            for (VectorsListener listener : eventListeners) {
                if (listener.validateEvent(ListenerEvent.EPOCH, currentEpoch))
                    listener.processEvent(ListenerEvent.EPOCH, this, currentEpoch);
            }
        }
    }
    log.info("Time spent on training: {} ms", System.currentTimeMillis() - timeSpent.get());
}
Also used : AtomicLong(java.util.concurrent.atomic.AtomicLong) Random(org.nd4j.linalg.api.rng.Random) INDArray(org.nd4j.linalg.api.ndarray.INDArray) VectorsListener(org.deeplearning4j.models.sequencevectors.interfaces.VectorsListener)
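
fit() shares its progress state with the calculation threads through plain AtomicLongs: wordsCounter and linesCounter are bumped by the workers, while timer and timeSpent act as mutable long holders the coordinator reads back later. A compact sketch of that counter sharing (worker logic and numbers are made up, not deeplearning4j code):

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.atomic.AtomicLong;

/**
 * Minimal sketch of how fit() shares progress state with its worker threads: the
 * words/lines counters and the timer are plain AtomicLongs, so every worker can bump
 * them and the coordinator can read a consistent value without extra locking.
 */
public class EpochCountersSketch {

    public static void main(String[] args) throws InterruptedException {
        final AtomicLong wordsCounter = new AtomicLong(0);
        final AtomicLong linesCounter = new AtomicLong(0);
        final AtomicLong timer = new AtomicLong(System.currentTimeMillis());

        List<Thread> workers = new ArrayList<>();
        for (int x = 0; x < 4; x++) {
            Thread worker = new Thread(() -> {
                for (int line = 0; line < 1_000; line++) {
                    linesCounter.incrementAndGet();
                    wordsCounter.addAndGet(12); // pretend every line holds 12 words
                }
            });
            workers.add(worker);
            worker.start();
        }
        for (Thread worker : workers) {
            worker.join();
        }

        long elapsed = System.currentTimeMillis() - timer.get();
        System.out.printf("lines=%d words=%d in %d ms%n",
                linesCounter.get(), wordsCounter.get(), elapsed);
    }
}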

Example 35 with AtomicLong

Use of java.util.concurrent.atomic.AtomicLong in project deeplearning4j by deeplearning4j.

The class VocabConstructor, method buildJointVocabulary.

/**
     * This method scans all sources passed through builder, and returns all words as vocab.
     * If TargetVocabCache was set during instance creation, it'll be filled too.
     *
     *
     * @return
     */
public VocabCache<T> buildJointVocabulary(boolean resetCounters, boolean buildHuffmanTree) {
    long lastTime = System.currentTimeMillis();
    long lastSequences = 0;
    long lastElements = 0;
    long startTime = lastTime;
    long startWords = 0;
    AtomicLong parsedCount = new AtomicLong(0);
    if (resetCounters && buildHuffmanTree)
        throw new IllegalStateException("You can't reset counters and build Huffman tree at the same time!");
    if (cache == null)
        cache = new AbstractCache.Builder<T>().build();
    log.debug("Target vocab size before building: [" + cache.numWords() + "]");
    final AtomicLong loopCounter = new AtomicLong(0);
    AbstractCache<T> topHolder = new AbstractCache.Builder<T>().minElementFrequency(0).build();
    int cnt = 0;
    int numProc = Runtime.getRuntime().availableProcessors();
    int numThreads = Math.max(numProc / 2, 2);
    ExecutorService executorService = new ThreadPoolExecutor(numThreads, numThreads, 0L, TimeUnit.MILLISECONDS, new LinkedTransferQueue<Runnable>());
    final AtomicLong execCounter = new AtomicLong(0);
    final AtomicLong finCounter = new AtomicLong(0);
    for (VocabSource<T> source : sources) {
        SequenceIterator<T> iterator = source.getIterator();
        iterator.reset();
        log.debug("Trying source iterator: [" + cnt + "]");
        log.debug("Target vocab size before building: [" + cache.numWords() + "]");
        cnt++;
        AbstractCache<T> tempHolder = new AbstractCache.Builder<T>().build();
        List<Long> timesHasNext = new ArrayList<>();
        List<Long> timesNext = new ArrayList<>();
        int sequences = 0;
        long time3 = 0;
        while (iterator.hasMoreSequences()) {
            Sequence<T> document = iterator.nextSequence();
            seqCount.incrementAndGet();
            parsedCount.addAndGet(document.size());
            tempHolder.incrementTotalDocCount();
            execCounter.incrementAndGet();
            VocabRunnable runnable = new VocabRunnable(tempHolder, document, finCounter, loopCounter);
            executorService.execute(runnable);
            // if we're not in parallel mode - wait till this runnable finishes
            if (!allowParallelBuilder) {
                while (execCounter.get() != finCounter.get()) LockSupport.parkNanos(1000);
            }
            while (execCounter.get() - finCounter.get() > numProc) {
                try {
                    Thread.sleep(1);
                } catch (Exception e) {
                }
            }
            sequences++;
            if (seqCount.get() % 100000 == 0) {
                long currentTime = System.currentTimeMillis();
                long currentSequences = seqCount.get();
                long currentElements = parsedCount.get();
                double seconds = (currentTime - lastTime) / (double) 1000;
                //                    Collections.sort(timesHasNext);
                //                    Collections.sort(timesNext);
                double seqPerSec = (currentSequences - lastSequences) / seconds;
                double elPerSec = (currentElements - lastElements) / seconds;
                //                    log.info("Document time: {} us; hasNext time: {} us", timesNext.get(timesNext.size() / 2), timesHasNext.get(timesHasNext.size() / 2));
                log.info("Sequences checked: [{}]; Current vocabulary size: [{}]; Sequences/sec: {}; Words/sec: {};", seqCount.get(), tempHolder.numWords(), String.format("%.2f", seqPerSec), String.format("%.2f", elPerSec));
                lastTime = currentTime;
                lastElements = currentElements;
                lastSequences = currentSequences;
            //                    timesHasNext.clear();
            //                    timesNext.clear();
            }
            /**
                 * Firing scavenger loop
                 */
            if (enableScavenger && loopCounter.get() >= 2000000 && tempHolder.numWords() > 10000000) {
                log.info("Starting scavenger...");
                while (execCounter.get() != finCounter.get()) {
                    try {
                        Thread.sleep(2);
                    } catch (Exception e) {
                    }
                }
                filterVocab(tempHolder, Math.max(1, source.getMinWordFrequency() / 2));
                loopCounter.set(0);
            }
        //                timesNext.add((time2 - time1) / 1000L);
        //                timesHasNext.add((time1 - time3) / 1000L);
        //                time3 = System.nanoTime();
        }
        // block untill all threads are finished
        log.debug("Wating till all processes stop...");
        while (execCounter.get() != finCounter.get()) {
            try {
                Thread.sleep(2);
            } catch (Exception e) {
            }
        }
        // apply minWordFrequency set for this source
        log.debug("Vocab size before truncation: [" + tempHolder.numWords() + "],  NumWords: [" + tempHolder.totalWordOccurrences() + "], sequences parsed: [" + seqCount.get() + "], counter: [" + parsedCount.get() + "]");
        if (source.getMinWordFrequency() > 0) {
            filterVocab(tempHolder, source.getMinWordFrequency());
        }
        log.debug("Vocab size after truncation: [" + tempHolder.numWords() + "],  NumWords: [" + tempHolder.totalWordOccurrences() + "], sequences parsed: [" + seqCount.get() + "], counter: [" + parsedCount.get() + "]");
        // at this moment we're ready to transfer
        topHolder.importVocabulary(tempHolder);
    }
    // at this moment, we have vocabulary full of words, and we have to reset counters before transfer everything back to VocabCache
    //topHolder.resetWordCounters();
    System.gc();
    System.gc();
    try {
        Thread.sleep(1000);
    } catch (Exception e) {
    //
    }
    cache.importVocabulary(topHolder);
    // adding UNK word
    if (unk != null) {
        log.info("Adding UNK element to vocab...");
        unk.setSpecial(true);
        cache.addToken(unk);
    }
    if (resetCounters) {
        for (T element : cache.vocabWords()) {
            element.setElementFrequency(0);
        }
        cache.updateWordsOccurencies();
    }
    if (buildHuffmanTree) {
        Huffman huffman = new Huffman(cache.vocabWords());
        huffman.build();
        huffman.applyIndexes(cache);
        if (limit > 0) {
            LinkedBlockingQueue<String> labelsToRemove = new LinkedBlockingQueue<>();
            for (T element : cache.vocabWords()) {
                if (element.getIndex() > limit && !element.isSpecial() && !element.isLabel())
                    labelsToRemove.add(element.getLabel());
            }
            for (String label : labelsToRemove) {
                cache.removeElement(label);
            }
        }
    }
    executorService.shutdown();
    System.gc();
    System.gc();
    try {
        Thread.sleep(1000);
    } catch (Exception e) {
    //
    }
    long endSequences = seqCount.get();
    long endTime = System.currentTimeMillis();
    double seconds = (endTime - startTime) / (double) 1000;
    double seqPerSec = endSequences / seconds;
    log.info("Sequences checked: [{}], Current vocabulary size: [{}]; Sequences/sec: [{}];", seqCount.get(), cache.numWords(), String.format("%.2f", seqPerSec));
    return cache;
}
Also used : AbstractCache(org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache) AtomicLong(java.util.concurrent.atomic.AtomicLong) AtomicLong(java.util.concurrent.atomic.AtomicLong) Huffman(org.deeplearning4j.models.word2vec.Huffman)
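
buildJointVocabulary() relies on a pair of AtomicLongs, execCounter and finCounter, as a lightweight completion barrier: the loop increments execCounter when it submits a VocabRunnable, the runnable increments finCounter when it finishes, and the loop throttles itself or waits for quiescence by comparing the two. A simplified sketch of that pattern (not the deeplearning4j implementation):

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.LockSupport;

/**
 * Sketch of the exec/fin counter pair used above: the producer bumps execCounter when
 * it submits a task, the task bumps finCounter when it finishes, and the producer
 * throttles submissions or waits for quiescence by comparing the two counters.
 */
public class CounterBarrierSketch {

    public static void main(String[] args) throws InterruptedException {
        final AtomicLong execCounter = new AtomicLong(0);
        final AtomicLong finCounter = new AtomicLong(0);
        final int maxInFlight = Runtime.getRuntime().availableProcessors();
        ExecutorService pool = Executors.newFixedThreadPool(maxInFlight);

        for (int i = 0; i < 10_000; i++) {
            execCounter.incrementAndGet();
            pool.execute(() -> {
                // ... count the tokens of one document here ...
                finCounter.incrementAndGet();
            });
            // backpressure: never let more than maxInFlight tasks pile up
            while (execCounter.get() - finCounter.get() > maxInFlight) {
                Thread.sleep(1);
            }
        }
        // wait until every submitted task has reported completion
        while (execCounter.get() != finCounter.get()) {
            LockSupport.parkNanos(1_000);
        }
        pool.shutdown();
        System.out.println("processed " + finCounter.get() + " documents");
    }
}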

Aggregations

AtomicLong (java.util.concurrent.atomic.AtomicLong): 678
Test (org.junit.Test): 261
AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean): 93
IOException (java.io.IOException): 82
CountDownLatch (java.util.concurrent.CountDownLatch): 68
ArrayList (java.util.ArrayList): 65
AtomicInteger (java.util.concurrent.atomic.AtomicInteger): 64
HashMap (java.util.HashMap): 47
Map (java.util.Map): 45
Random (java.util.Random): 43
List (java.util.List): 42
AtomicReference (java.util.concurrent.atomic.AtomicReference): 40
File (java.io.File): 34
ExecutorService (java.util.concurrent.ExecutorService): 24
ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap): 23
HashSet (java.util.HashSet): 21
Test (org.testng.annotations.Test): 21
InetSocketAddress (java.net.InetSocketAddress): 16
InputStream (java.io.InputStream): 13
Set (java.util.Set): 13