Use of java.util.concurrent.atomic.AtomicLong in project crate by crate.
The class BlobRecoverySourceHandler, method phase1:
/**
* Perform phase1 of the recovery operations. Once this {@link SnapshotIndexCommit}
* snapshot has been performed, no commit operations (files being fsync'd)
* are effectively allowed on this index until all recovery phases are done
* <p/>
* Phase1 examines the segment files on the target node and copies over the
* segments that are missing. Only segments that have the same size and
* checksum can be reused
*/
public void phase1(final SnapshotIndexCommit snapshot, final Translog.View translogView) {
cancellableThreads.checkForCancel();
// Total size of segment files that are recovered
long totalSize = 0;
// Total size of segment files that were able to be re-used
long existingTotalSize = 0;
final Store store = shard.store();
store.incRef();
try {
// CRATE CHANGE
if (blobRecoveryHandler != null) {
blobRecoveryHandler.phase1();
}
StopWatch stopWatch = new StopWatch().start();
final Store.MetadataSnapshot recoverySourceMetadata;
try {
recoverySourceMetadata = store.getMetadata(snapshot);
} catch (CorruptIndexException | IndexFormatTooOldException | IndexFormatTooNewException ex) {
shard.engine().failEngine("recovery", ex);
throw ex;
}
for (String name : snapshot.getFiles()) {
final StoreFileMetaData md = recoverySourceMetadata.get(name);
if (md == null) {
logger.info("Snapshot differs from actual index for file: {} meta: {}", name, recoverySourceMetadata.asMap());
throw new CorruptIndexException("Snapshot differs from actual index - maybe index was removed metadata has " + recoverySourceMetadata.asMap().size() + " files", name);
}
}
// Generate a "diff" of all the identical, different, and missing
// segment files on the target node, using the existing files on
// the source node
String recoverySourceSyncId = recoverySourceMetadata.getSyncId();
String recoveryTargetSyncId = request.metadataSnapshot().getSyncId();
final boolean recoverWithSyncId = recoverySourceSyncId != null && recoverySourceSyncId.equals(recoveryTargetSyncId);
if (recoverWithSyncId) {
final long numDocsTarget = request.metadataSnapshot().getNumDocs();
final long numDocsSource = recoverySourceMetadata.getNumDocs();
if (numDocsTarget != numDocsSource) {
throw new IllegalStateException("try to recover " + request.shardId() + " from primary shard with sync id but number of docs differ: " + numDocsTarget + " (" + request.sourceNode().getName() + ", primary) vs " + numDocsSource + "(" + request.targetNode().getName() + ")");
}
// we shortcut recovery here because we have nothing to copy. but we must still start the engine on the target.
// so we don't return here
logger.trace("[{}][{}] skipping [phase1] to {} - identical sync id [{}] found on both source and target", indexName, shardId, request.targetNode(), recoverySourceSyncId);
} else {
final Store.RecoveryDiff diff = recoverySourceMetadata.recoveryDiff(request.metadataSnapshot());
for (StoreFileMetaData md : diff.identical) {
response.phase1ExistingFileNames.add(md.name());
response.phase1ExistingFileSizes.add(md.length());
existingTotalSize += md.length();
if (logger.isTraceEnabled()) {
logger.trace("[{}][{}] recovery [phase1] to {}: not recovering [{}], exists in local store and has checksum [{}], size [{}]", indexName, shardId, request.targetNode(), md.name(), md.checksum(), md.length());
}
totalSize += md.length();
}
for (StoreFileMetaData md : Iterables.concat(diff.different, diff.missing)) {
if (request.metadataSnapshot().asMap().containsKey(md.name())) {
logger.trace("[{}][{}] recovery [phase1] to {}: recovering [{}], exists in local store, but is different: remote [{}], local [{}]", indexName, shardId, request.targetNode(), md.name(), request.metadataSnapshot().asMap().get(md.name()), md);
} else {
logger.trace("[{}][{}] recovery [phase1] to {}: recovering [{}], does not exists in remote", indexName, shardId, request.targetNode(), md.name());
}
response.phase1FileNames.add(md.name());
response.phase1FileSizes.add(md.length());
totalSize += md.length();
}
response.phase1TotalSize = totalSize;
response.phase1ExistingTotalSize = existingTotalSize;
logger.trace("[{}][{}] recovery [phase1] to {}: recovering_files [{}] with total_size [{}], reusing_files [{}] with total_size [{}]", indexName, shardId, request.targetNode(), response.phase1FileNames.size(), new ByteSizeValue(totalSize), response.phase1ExistingFileNames.size(), new ByteSizeValue(existingTotalSize));
cancellableThreads.execute(new Interruptable() {
@Override
public void run() throws InterruptedException {
RecoveryFilesInfoRequest recoveryInfoFilesRequest = new RecoveryFilesInfoRequest(request.recoveryId(), request.shardId(), response.phase1FileNames, response.phase1FileSizes, response.phase1ExistingFileNames, response.phase1ExistingFileSizes, translogView.totalOperations());
transportService.submitRequest(request.targetNode(), RecoveryTarget.Actions.FILES_INFO, recoveryInfoFilesRequest, TransportRequestOptions.builder().withTimeout(recoverySettings.internalActionTimeout()).build(), EmptyTransportResponseHandler.INSTANCE_SAME).txGet();
}
});
// This latch will be used to wait until all files have been transferred to the target node
final CountDownLatch latch = new CountDownLatch(response.phase1FileNames.size());
final CopyOnWriteArrayList<Throwable> exceptions = new CopyOnWriteArrayList<>();
final AtomicReference<Throwable> corruptedEngine = new AtomicReference<>();
int fileIndex = 0;
ThreadPoolExecutor pool;
// How many bytes we've copied since we last called RateLimiter.pause
final AtomicLong bytesSinceLastPause = new AtomicLong();
for (final String name : response.phase1FileNames) {
long fileSize = response.phase1FileSizes.get(fileIndex);
// Small files and large files are streamed through separate thread pools.
if (fileSize > RecoverySettings.SMALL_FILE_CUTOFF_BYTES) {
pool = recoverySettings.concurrentStreamPool();
} else {
pool = recoverySettings.concurrentSmallFileStreamPool();
}
pool.execute(new AbstractRunnable() {
@Override
public void onFailure(Throwable t) {
// we either got rejected or the store can't be incremented / we are canceled
logger.debug("Failed to transfer file [" + name + "] on recovery");
}
@Override
public void onAfter() {
// Signify this file has completed by decrementing the latch
latch.countDown();
}
@Override
protected void doRun() {
cancellableThreads.checkForCancel();
store.incRef();
final StoreFileMetaData md = recoverySourceMetadata.get(name);
try (final IndexInput indexInput = store.directory().openInput(name, IOContext.READONCE)) {
// at least one!
final int BUFFER_SIZE = (int) Math.max(1, recoverySettings.fileChunkSize().getBytes());
final byte[] buf = new byte[BUFFER_SIZE];
boolean shouldCompressRequest = recoverySettings.compress();
if (CompressorFactory.isCompressed(indexInput)) {
shouldCompressRequest = false;
}
final long len = indexInput.length();
long readCount = 0;
final TransportRequestOptions requestOptions = TransportRequestOptions.builder().withCompress(shouldCompressRequest).withType(TransportRequestOptions.Type.RECOVERY).withTimeout(recoverySettings.internalActionTimeout()).build();
while (readCount < len) {
if (shard.state() == IndexShardState.CLOSED) {
// check if the shard got closed on us
throw new IndexShardClosedException(shard.shardId());
}
int toRead = readCount + BUFFER_SIZE > len ? (int) (len - readCount) : BUFFER_SIZE;
final long position = indexInput.getFilePointer();
// Pause using the rate limiter, if desired, to throttle the recovery
RateLimiter rl = recoverySettings.rateLimiter();
long throttleTimeInNanos = 0;
if (rl != null) {
long bytes = bytesSinceLastPause.addAndGet(toRead);
if (bytes > rl.getMinPauseCheckBytes()) {
// Time to pause
bytesSinceLastPause.addAndGet(-bytes);
throttleTimeInNanos = rl.pause(bytes);
shard.recoveryStats().addThrottleTime(throttleTimeInNanos);
}
}
indexInput.readBytes(buf, 0, toRead, false);
final BytesArray content = new BytesArray(buf, 0, toRead);
readCount += toRead;
final boolean lastChunk = readCount == len;
final RecoveryFileChunkRequest fileChunkRequest = new RecoveryFileChunkRequest(request.recoveryId(), request.shardId(), md, position, content, lastChunk, translogView.totalOperations(), throttleTimeInNanos);
cancellableThreads.execute(new Interruptable() {
@Override
public void run() throws InterruptedException {
// Actually send the file chunk to the target node, waiting for it to complete
transportService.submitRequest(request.targetNode(), RecoveryTarget.Actions.FILE_CHUNK, fileChunkRequest, requestOptions, EmptyTransportResponseHandler.INSTANCE_SAME).txGet();
}
});
}
} catch (Throwable e) {
final Throwable corruptIndexException;
if ((corruptIndexException = ExceptionsHelper.unwrapCorruption(e)) != null) {
if (store.checkIntegrityNoException(md) == false) {
// we are corrupted on the primary -- fail!
logger.warn("{} Corrupted file detected {} checksum mismatch", shard.shardId(), md);
if (corruptedEngine.compareAndSet(null, corruptIndexException) == false) {
// if we are not the first exception, add ourselves as suppressed to the main one:
corruptedEngine.get().addSuppressed(e);
}
} else {
// corruption has happened on the way to replica
RemoteTransportException exception = new RemoteTransportException("File corruption occurred on recovery but checksums are ok", null);
exception.addSuppressed(e);
// last exception first
exceptions.add(0, exception);
logger.warn("{} Remote file corruption on node {}, recovering {}. local checksum OK", corruptIndexException, shard.shardId(), request.targetNode(), md);
}
} else {
// last exceptions first
exceptions.add(0, e);
}
} finally {
store.decRef();
}
}
});
fileIndex++;
}
cancellableThreads.execute(new Interruptable() {
@Override
public void run() throws InterruptedException {
// Wait for all files that need to be transferred to finish transferring
latch.await();
}
});
if (corruptedEngine.get() != null) {
shard.engine().failEngine("recovery", corruptedEngine.get());
throw corruptedEngine.get();
} else {
ExceptionsHelper.rethrowAndSuppress(exceptions);
}
cancellableThreads.execute(new Interruptable() {
@Override
public void run() throws InterruptedException {
// Send the CLEAN_FILES request: the target renames the transferred temporary files to their final names, and any files not related to this recovery (out-of-date segments, for example) are deleted
try {
transportService.submitRequest(request.targetNode(), RecoveryTarget.Actions.CLEAN_FILES, new RecoveryCleanFilesRequest(request.recoveryId(), shard.shardId(), recoverySourceMetadata, translogView.totalOperations()), TransportRequestOptions.builder().withTimeout(recoverySettings.internalActionTimeout()).build(), EmptyTransportResponseHandler.INSTANCE_SAME).txGet();
} catch (RemoteTransportException remoteException) {
final IOException corruptIndexException;
// the target index may turn out to be corrupted during finalization - maybe due to old segments without checksums or length-only checks
if ((corruptIndexException = ExceptionsHelper.unwrapCorruption(remoteException)) != null) {
try {
final Store.MetadataSnapshot recoverySourceMetadata = store.getMetadata(snapshot);
StoreFileMetaData[] metadata = Iterables.toArray(recoverySourceMetadata, StoreFileMetaData.class);
ArrayUtil.timSort(metadata, new Comparator<StoreFileMetaData>() {
@Override
public int compare(StoreFileMetaData o1, StoreFileMetaData o2) {
// check small files first
return Long.compare(o1.length(), o2.length());
}
});
for (StoreFileMetaData md : metadata) {
logger.debug("{} checking integrity for file {} after remove corruption exception", shard.shardId(), md);
if (store.checkIntegrityNoException(md) == false) {
// we are corrupted on the primary -- fail!
shard.engine().failEngine("recovery", corruptIndexException);
logger.warn("{} Corrupted file detected {} checksum mismatch", shard.shardId(), md);
throw corruptIndexException;
}
}
} catch (IOException ex) {
remoteException.addSuppressed(ex);
throw remoteException;
}
// corruption has happened on the way to replica
RemoteTransportException exception = new RemoteTransportException("File corruption occurred on recovery but checksums are ok", null);
exception.addSuppressed(remoteException);
logger.warn("{} Remote file corruption during finalization on node {}, recovering {}. local checksum OK", corruptIndexException, shard.shardId(), request.targetNode());
throw exception;
} else {
throw remoteException;
}
}
}
});
}
prepareTargetForTranslog(translogView);
logger.trace("[{}][{}] recovery [phase1] to {}: took [{}]", indexName, shardId, request.targetNode(), stopWatch.totalTime());
response.phase1Time = stopWatch.totalTime().millis();
} catch (Throwable e) {
throw new RecoverFilesRecoveryException(request.shardId(), response.phase1FileNames.size(), new ByteSizeValue(totalSize), e);
} finally {
store.decRef();
}
}
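The throttling loop above is the AtomicLong usage of interest: every chunk sender adds the bytes it just read to the shared bytesSinceLastPause counter and only pauses once the accumulated total crosses the rate limiter's minimum pause threshold. Below is a minimal, self-contained sketch of the same accumulate-then-pause pattern; it uses a plain bytes-per-second pause instead of Lucene's RateLimiter, and the class and field names are illustrative only.

import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;

// Sketch of the phase1 throttling idiom: several sender threads add the bytes
// they just read to one shared AtomicLong and only pause once enough bytes have
// accumulated, so the pause bookkeeping is not done on every small read.
class SimpleRecoveryThrottle {

    private final AtomicLong bytesSinceLastPause = new AtomicLong();
    private final long minPauseCheckBytes;   // threshold before we bother pausing
    private final double maxBytesPerSec;     // hypothetical configured rate limit

    SimpleRecoveryThrottle(long minPauseCheckBytes, double maxBytesPerSec) {
        this.minPauseCheckBytes = minPauseCheckBytes;
        this.maxBytesPerSec = maxBytesPerSec;
    }

    /** Called by a sender thread after reading {@code justRead} bytes; returns nanoseconds slept. */
    long maybePause(int justRead) throws InterruptedException {
        long accumulated = bytesSinceLastPause.addAndGet(justRead);
        if (accumulated <= minPauseCheckBytes) {
            return 0;
        }
        // claim the accumulated bytes for this pause; other threads keep adding meanwhile
        bytesSinceLastPause.addAndGet(-accumulated);
        long pauseNanos = (long) (accumulated / maxBytesPerSec * TimeUnit.SECONDS.toNanos(1));
        TimeUnit.NANOSECONDS.sleep(pauseNanos);
        return pauseNanos;
    }
}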
Use of java.util.concurrent.atomic.AtomicLong in project crate by crate.
The class DigestBlob, method resumeTransfer:
public static DigestBlob resumeTransfer(BlobContainer blobContainer, String digest, UUID transferId, long currentPos) {
DigestBlob digestBlob = new DigestBlob(blobContainer, digest, transferId);
digestBlob.file = getTmpFilePath(blobContainer, digest, transferId).toFile();
try {
logger.trace("Resuming DigestBlob {}. CurrentPos {}", digest, currentPos);
digestBlob.headFileChannel = new FileOutputStream(digestBlob.file, false).getChannel();
digestBlob.headLength = currentPos;
digestBlob.headSize = new AtomicLong();
digestBlob.headCatchedUpLatch = new CountDownLatch(1);
RandomAccessFile raf = new RandomAccessFile(digestBlob.file, "rw");
raf.setLength(currentPos);
raf.close();
FileOutputStream outputStream = new FileOutputStream(digestBlob.file, true);
digestBlob.fileChannel = outputStream.getChannel();
} catch (IOException ex) {
logger.error("error resuming transfer of {}, id: {}", ex, digest, transferId);
return null;
}
return digestBlob;
}
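Here the AtomicLong (headSize in DigestBlob) counts the re-sent head bytes of a resumed transfer so that a latch can be released once the head has caught up with the known head length. The following stand-alone sketch mirrors that resume bookkeeping; ResumableFileTransfer and its members are illustrative names, not the CrateDB API.

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.atomic.AtomicLong;

// Opening the head stream with append=false truncates the temp file, which is
// why the length is restored with setLength(currentPos) before the append-mode
// channel is opened - mirroring the order of operations in resumeTransfer.
class ResumableFileTransfer {
    final FileChannel headChannel;   // rewrites bytes [0, currentPos)
    final FileChannel appendChannel; // appends bytes from currentPos onwards
    final long headLength;
    final AtomicLong headBytesReceived = new AtomicLong();
    final CountDownLatch headCaughtUpLatch = new CountDownLatch(1);

    ResumableFileTransfer(File file, long currentPos) throws IOException {
        headChannel = new FileOutputStream(file, false).getChannel(); // truncates the file
        headLength = currentPos;
        try (RandomAccessFile raf = new RandomAccessFile(file, "rw")) {
            raf.setLength(currentPos); // restore the previously received length
        }
        appendChannel = new FileOutputStream(file, true).getChannel();
    }

    void addHeadChunk(ByteBuffer chunk) throws IOException {
        int written = headChannel.write(chunk);
        if (headBytesReceived.addAndGet(written) >= headLength) {
            headCaughtUpLatch.countDown(); // head is complete, the digest can be finalized
        }
    }
}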
Use of java.util.concurrent.atomic.AtomicLong in project crate by crate.
The class RecoveryTests, method testPrimaryRelocationWhileIndexing:
@Test
public void testPrimaryRelocationWhileIndexing() throws Exception {
final int numberOfRelocations = 1;
final int numberOfWriters = 2;
final String node1 = internalCluster().startNode();
BlobAdminClient blobAdminClient = internalCluster().getInstance(BlobAdminClient.class, node1);
logger.trace("--> creating test index ...");
Settings indexSettings = Settings.builder().put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0).put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1).build();
blobAdminClient.createBlobTable("test", indexSettings).get();
logger.trace("--> starting [node2] ...");
final String node2 = internalCluster().startNode();
ensureGreen();
final AtomicLong idGenerator = new AtomicLong();
final AtomicLong indexCounter = new AtomicLong();
final AtomicBoolean stop = new AtomicBoolean(false);
Thread[] writers = new Thread[numberOfWriters];
final CountDownLatch stopLatch = new CountDownLatch(writers.length);
logger.trace("--> starting {} blob upload threads", writers.length);
final List<String> uploadedDigests = Collections.synchronizedList(new ArrayList<String>(writers.length));
for (int i = 0; i < writers.length; i++) {
final int indexerId = i;
writers[i] = new Thread() {
@Override
public void run() {
try {
logger.trace("**** starting blob upload thread {}", indexerId);
while (!stop.get()) {
long id = idGenerator.incrementAndGet();
String digest = uploadFile(internalCluster().client(node1), genFile(id));
uploadedDigests.add(digest);
indexCounter.incrementAndGet();
}
logger.trace("**** done indexing thread {}", indexerId);
} catch (Exception e) {
logger.warn("**** failed indexing thread {}", e, indexerId);
} finally {
stopLatch.countDown();
}
}
};
writers[i].setName("blob-uploader-thread");
// dispatch threads from parent, ignoring possible leaking threads
writers[i].setDaemon(true);
writers[i].start();
}
logger.trace("--> waiting for 2 blobs to be uploaded ...");
while (uploadedDigests.size() < 2) {
Thread.sleep(10);
}
logger.trace("--> 2 blobs uploaded");
// increase time between chunks in order to make sure that the upload is taking place while relocating
timeBetweenChunks.set(10);
logger.trace("--> starting relocations...");
for (int i = 0; i < numberOfRelocations; i++) {
String fromNode = (i % 2 == 0) ? node1 : node2;
String toNode = node1.equals(fromNode) ? node2 : node1;
logger.trace("--> START relocate the shard from {} to {}", fromNode, toNode);
internalCluster().client(node1).admin().cluster().prepareReroute().add(new MoveAllocationCommand(new ShardId(BlobIndex.fullIndexName("test"), 0), fromNode, toNode)).execute().actionGet();
ClusterHealthResponse clusterHealthResponse = internalCluster().client(node1).admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).setWaitForRelocatingShards(0).setTimeout(ACCEPTABLE_RELOCATION_TIME).execute().actionGet();
assertThat(clusterHealthResponse.isTimedOut(), equalTo(false));
clusterHealthResponse = internalCluster().client(node2).admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).setWaitForRelocatingShards(0).setTimeout(ACCEPTABLE_RELOCATION_TIME).execute().actionGet();
assertThat(clusterHealthResponse.isTimedOut(), equalTo(false));
logger.trace("--> DONE relocate the shard from {} to {}", fromNode, toNode);
}
logger.trace("--> done relocations");
logger.trace("--> marking and waiting for upload threads to stop ...");
timeBetweenChunks.set(0);
stop.set(true);
assertThat(stopLatch.await(60, TimeUnit.SECONDS), is(true));
logger.trace("--> uploading threads stopped");
logger.trace("--> expected {} got {}", indexCounter.get(), uploadedDigests.size());
assertEquals(indexCounter.get(), uploadedDigests.size());
BlobIndicesService blobIndicesService = internalCluster().getInstance(BlobIndicesService.class, node2);
for (String digest : uploadedDigests) {
BlobShard blobShard = blobIndicesService.localBlobShard(BlobIndex.fullIndexName("test"), digest);
long length = blobShard.blobContainer().getFile(digest).length();
assertThat(length, greaterThanOrEqualTo(1L));
}
for (Thread writer : writers) {
writer.join(6000);
}
}
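The test relies on two shared AtomicLong instances: idGenerator hands out unique file ids across the writer threads, and indexCounter records how many uploads completed so the final assertion can compare it with the number of collected digests. The stripped-down sketch below shows just that coordination skeleton, with the actual blob upload replaced by a stub; it assumes nothing about the CrateDB test infrastructure.

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;

// An AtomicLong hands out unique ids, a second AtomicLong counts completed
// uploads, an AtomicBoolean signals shutdown and a CountDownLatch waits for
// all writer threads to drain.
public class ConcurrentWriterSketch {
    public static void main(String[] args) throws InterruptedException {
        final int numberOfWriters = 2;
        final AtomicLong idGenerator = new AtomicLong();
        final AtomicLong indexCounter = new AtomicLong();
        final AtomicBoolean stop = new AtomicBoolean(false);
        final CountDownLatch stopLatch = new CountDownLatch(numberOfWriters);
        final List<String> uploaded = Collections.synchronizedList(new ArrayList<String>());

        for (int i = 0; i < numberOfWriters; i++) {
            Thread writer = new Thread(() -> {
                try {
                    while (!stop.get()) {
                        long id = idGenerator.incrementAndGet(); // unique across threads
                        uploaded.add("digest-" + id);            // stand-in for uploadFile(...)
                        indexCounter.incrementAndGet();
                    }
                } finally {
                    stopLatch.countDown();
                }
            });
            writer.setDaemon(true);
            writer.start();
        }

        while (uploaded.size() < 2) {
            Thread.sleep(10); // wait until some work has happened
        }
        stop.set(true);
        if (!stopLatch.await(60, TimeUnit.SECONDS)) {
            throw new IllegalStateException("writers did not stop in time");
        }
        if (indexCounter.get() != uploaded.size()) {
            throw new AssertionError("every generated id should produce exactly one recorded digest");
        }
    }
}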
Use of java.util.concurrent.atomic.AtomicLong in project deeplearning4j by deeplearning4j.
The class SequenceVectors, method fit:
/**
* Starts training over
*/
public void fit() {
Properties props = Nd4j.getExecutioner().getEnvironmentInformation();
if (props.getProperty("backend").equals("CUDA")) {
if (Nd4j.getAffinityManager().getNumberOfDevices() > 1)
throw new IllegalStateException("Multi-GPU word2vec/doc2vec isn't available atm");
//if (!NativeOpsHolder.getInstance().getDeviceNativeOps().isP2PAvailable())
//throw new IllegalStateException("Running Word2Vec on multi-gpu system requires P2P support between GPUs, which looks to be unavailable on your system.");
}
Nd4j.getRandom().setSeed(configuration.getSeed());
AtomicLong timeSpent = new AtomicLong(0);
if (!trainElementsVectors && !trainSequenceVectors)
throw new IllegalStateException("You should define at least one training goal 'trainElementsRepresentation' or 'trainSequenceRepresentation'");
if (iterator == null)
throw new IllegalStateException("You can't fit() data without SequenceIterator defined");
if (resetModel || (lookupTable != null && vocab != null && vocab.numWords() == 0)) {
// build vocabulary from scratch
buildVocab();
}
WordVectorSerializer.printOutProjectedMemoryUse(vocab.numWords(), configuration.getLayersSize(), configuration.isUseHierarchicSoftmax() && configuration.getNegative() > 0 ? 3 : 2);
if (vocab == null || lookupTable == null || vocab.numWords() == 0)
throw new IllegalStateException("You can't fit() model with empty Vocabulary or WeightLookupTable");
// if the model vocab and lookupTable are built externally, we basically just need to check that the lookupTable was properly initialized
if (!resetModel || existingModel != null) {
lookupTable.resetWeights(false);
} else {
// otherwise we reset weights, independent of actual current state of lookup table
lookupTable.resetWeights(true);
// if preciseWeights used, we roll over data once again
if (configuration.isPreciseWeightInit()) {
log.info("Using precise weights init...");
iterator.reset();
while (iterator.hasMoreSequences()) {
Sequence<T> sequence = iterator.nextSequence();
// initializing elements, only once
for (T element : sequence.getElements()) {
T realElement = vocab.tokenFor(element.getLabel());
if (realElement != null && !realElement.isInit()) {
Random rng = Nd4j.getRandomFactory().getNewRandomInstance(configuration.getSeed() * realElement.hashCode(), configuration.getLayersSize() + 1);
INDArray randArray = Nd4j.rand(new int[] { 1, configuration.getLayersSize() }, rng).subi(0.5).divi(configuration.getLayersSize());
lookupTable.getWeights().getRow(realElement.getIndex()).assign(randArray);
realElement.setInit(true);
}
}
// initializing labels, only once
for (T label : sequence.getSequenceLabels()) {
T realElement = vocab.tokenFor(label.getLabel());
if (realElement != null && !realElement.isInit()) {
Random rng = Nd4j.getRandomFactory().getNewRandomInstance(configuration.getSeed() * realElement.hashCode(), configuration.getLayersSize() + 1);
INDArray randArray = Nd4j.rand(new int[] { 1, configuration.getLayersSize() }, rng).subi(0.5).divi(configuration.getLayersSize());
lookupTable.getWeights().getRow(realElement.getIndex()).assign(randArray);
realElement.setInit(true);
}
}
}
this.iterator.reset();
}
}
initLearners();
log.info("Starting learning process...");
timeSpent.set(System.currentTimeMillis());
if (this.stopWords == null)
this.stopWords = new ArrayList<>();
for (int currentEpoch = 1; currentEpoch <= numEpochs; currentEpoch++) {
final AtomicLong linesCounter = new AtomicLong(0);
final AtomicLong wordsCounter = new AtomicLong(0);
AsyncSequencer sequencer = new AsyncSequencer(this.iterator, this.stopWords);
sequencer.start();
//final VectorCalculationsThread[] threads = new VectorCalculationsThread[workers];
final AtomicLong timer = new AtomicLong(System.currentTimeMillis());
final List<VectorCalculationsThread> threads = new ArrayList<>();
for (int x = 0; x < workers; x++) {
threads.add(x, new VectorCalculationsThread(x, currentEpoch, wordsCounter, vocab.totalWordOccurrences(), linesCounter, sequencer, timer));
threads.get(x).start();
}
try {
sequencer.join();
} catch (Exception e) {
throw new RuntimeException(e);
}
for (int x = 0; x < workers; x++) {
try {
threads.get(x).join();
} catch (Exception e) {
throw new RuntimeException(e);
}
}
// TODO: fix this to non-exclusive termination
if (trainElementsVectors && elementsLearningAlgorithm != null && (!trainSequenceVectors || sequenceLearningAlgorithm == null) && elementsLearningAlgorithm.isEarlyTerminationHit()) {
break;
}
if (trainSequenceVectors && sequenceLearningAlgorithm != null && (!trainElementsVectors || elementsLearningAlgorithm == null) && sequenceLearningAlgorithm.isEarlyTerminationHit()) {
break;
}
log.info("Epoch: [" + currentEpoch + "]; Words vectorized so far: [" + wordsCounter.get() + "]; Lines vectorized so far: [" + linesCounter.get() + "]; learningRate: [" + minLearningRate + "]");
if (eventListeners != null && !eventListeners.isEmpty()) {
for (VectorsListener listener : eventListeners) {
if (listener.validateEvent(ListenerEvent.EPOCH, currentEpoch))
listener.processEvent(ListenerEvent.EPOCH, this, currentEpoch);
}
}
}
log.info("Time spent on training: {} ms", System.currentTimeMillis() - timeSpent.get());
}
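fit() shares several AtomicLong counters (timeSpent, wordsCounter, linesCounter, timer) between the driver and the VectorCalculationsThread workers, so per-epoch progress can be reported after join() without any extra locking. A minimal sketch of that wiring, with the training work stubbed out and all names illustrative:

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.atomic.AtomicLong;

// Each worker thread gets references to the same AtomicLong instances, so the
// driver sees the combined totals once every worker has been joined.
public class EpochWorkersSketch {

    static class Worker extends Thread {
        private final AtomicLong wordsCounter;
        private final AtomicLong linesCounter;

        Worker(AtomicLong wordsCounter, AtomicLong linesCounter) {
            this.wordsCounter = wordsCounter;
            this.linesCounter = linesCounter;
        }

        @Override
        public void run() {
            // stand-in for processing sequences pulled from the sequencer
            for (int i = 0; i < 1_000; i++) {
                wordsCounter.addAndGet(10); // pretend each line holds 10 words
                linesCounter.incrementAndGet();
            }
        }
    }

    public static void main(String[] args) throws InterruptedException {
        int workers = 4;
        AtomicLong timeSpent = new AtomicLong(System.currentTimeMillis());
        AtomicLong wordsCounter = new AtomicLong(0);
        AtomicLong linesCounter = new AtomicLong(0);

        List<Worker> threads = new ArrayList<>();
        for (int x = 0; x < workers; x++) {
            Worker w = new Worker(wordsCounter, linesCounter);
            threads.add(w);
            w.start();
        }
        for (Worker w : threads) {
            w.join();
        }
        System.out.println("Words: " + wordsCounter.get()
                + "; lines: " + linesCounter.get()
                + "; took " + (System.currentTimeMillis() - timeSpent.get()) + " ms");
    }
}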
Use of java.util.concurrent.atomic.AtomicLong in project deeplearning4j by deeplearning4j.
The class VocabConstructor, method buildJointVocabulary:
/**
* This method scans all sources passed through the builder and returns all words as a vocab.
* If TargetVocabCache was set during instance creation, it will be filled too.
*
*
* @return
*/
public VocabCache<T> buildJointVocabulary(boolean resetCounters, boolean buildHuffmanTree) {
long lastTime = System.currentTimeMillis();
long lastSequences = 0;
long lastElements = 0;
long startTime = lastTime;
long startWords = 0;
AtomicLong parsedCount = new AtomicLong(0);
if (resetCounters && buildHuffmanTree)
throw new IllegalStateException("You can't reset counters and build Huffman tree at the same time!");
if (cache == null)
cache = new AbstractCache.Builder<T>().build();
log.debug("Target vocab size before building: [" + cache.numWords() + "]");
final AtomicLong loopCounter = new AtomicLong(0);
AbstractCache<T> topHolder = new AbstractCache.Builder<T>().minElementFrequency(0).build();
int cnt = 0;
int numProc = Runtime.getRuntime().availableProcessors();
int numThreads = Math.max(numProc / 2, 2);
ExecutorService executorService = new ThreadPoolExecutor(numThreads, numThreads, 0L, TimeUnit.MILLISECONDS, new LinkedTransferQueue<Runnable>());
final AtomicLong execCounter = new AtomicLong(0);
final AtomicLong finCounter = new AtomicLong(0);
for (VocabSource<T> source : sources) {
SequenceIterator<T> iterator = source.getIterator();
iterator.reset();
log.debug("Trying source iterator: [" + cnt + "]");
log.debug("Target vocab size before building: [" + cache.numWords() + "]");
cnt++;
AbstractCache<T> tempHolder = new AbstractCache.Builder<T>().build();
List<Long> timesHasNext = new ArrayList<>();
List<Long> timesNext = new ArrayList<>();
int sequences = 0;
long time3 = 0;
while (iterator.hasMoreSequences()) {
Sequence<T> document = iterator.nextSequence();
seqCount.incrementAndGet();
parsedCount.addAndGet(document.size());
tempHolder.incrementTotalDocCount();
execCounter.incrementAndGet();
VocabRunnable runnable = new VocabRunnable(tempHolder, document, finCounter, loopCounter);
executorService.execute(runnable);
// if we're not in parallel mode - wait till this runnable finishes
if (!allowParallelBuilder) {
while (execCounter.get() != finCounter.get()) LockSupport.parkNanos(1000);
}
while (execCounter.get() - finCounter.get() > numProc) {
try {
Thread.sleep(1);
} catch (Exception e) {
}
}
sequences++;
if (seqCount.get() % 100000 == 0) {
long currentTime = System.currentTimeMillis();
long currentSequences = seqCount.get();
long currentElements = parsedCount.get();
double seconds = (currentTime - lastTime) / (double) 1000;
// Collections.sort(timesHasNext);
// Collections.sort(timesNext);
double seqPerSec = (currentSequences - lastSequences) / seconds;
double elPerSec = (currentElements - lastElements) / seconds;
// log.info("Document time: {} us; hasNext time: {} us", timesNext.get(timesNext.size() / 2), timesHasNext.get(timesHasNext.size() / 2));
log.info("Sequences checked: [{}]; Current vocabulary size: [{}]; Sequences/sec: {}; Words/sec: {};", seqCount.get(), tempHolder.numWords(), String.format("%.2f", seqPerSec), String.format("%.2f", elPerSec));
lastTime = currentTime;
lastElements = currentElements;
lastSequences = currentSequences;
// timesHasNext.clear();
// timesNext.clear();
}
/**
* Firing scavenger loop
*/
if (enableScavenger && loopCounter.get() >= 2000000 && tempHolder.numWords() > 10000000) {
log.info("Starting scavenger...");
while (execCounter.get() != finCounter.get()) {
try {
Thread.sleep(2);
} catch (Exception e) {
}
}
filterVocab(tempHolder, Math.max(1, source.getMinWordFrequency() / 2));
loopCounter.set(0);
}
// timesNext.add((time2 - time1) / 1000L);
// timesHasNext.add((time1 - time3) / 1000L);
// time3 = System.nanoTime();
}
// block until all threads are finished
log.debug("Waiting till all processes stop...");
while (execCounter.get() != finCounter.get()) {
try {
Thread.sleep(2);
} catch (Exception e) {
}
}
// apply minWordFrequency set for this source
log.debug("Vocab size before truncation: [" + tempHolder.numWords() + "], NumWords: [" + tempHolder.totalWordOccurrences() + "], sequences parsed: [" + seqCount.get() + "], counter: [" + parsedCount.get() + "]");
if (source.getMinWordFrequency() > 0) {
filterVocab(tempHolder, source.getMinWordFrequency());
}
log.debug("Vocab size after truncation: [" + tempHolder.numWords() + "], NumWords: [" + tempHolder.totalWordOccurrences() + "], sequences parsed: [" + seqCount.get() + "], counter: [" + parsedCount.get() + "]");
// at this moment we're ready to transfer
topHolder.importVocabulary(tempHolder);
}
// at this moment, we have vocabulary full of words, and we have to reset counters before transfer everything back to VocabCache
//topHolder.resetWordCounters();
System.gc();
System.gc();
try {
Thread.sleep(1000);
} catch (Exception e) {
//
}
cache.importVocabulary(topHolder);
// adding UNK word
if (unk != null) {
log.info("Adding UNK element to vocab...");
unk.setSpecial(true);
cache.addToken(unk);
}
if (resetCounters) {
for (T element : cache.vocabWords()) {
element.setElementFrequency(0);
}
cache.updateWordsOccurencies();
}
if (buildHuffmanTree) {
Huffman huffman = new Huffman(cache.vocabWords());
huffman.build();
huffman.applyIndexes(cache);
if (limit > 0) {
LinkedBlockingQueue<String> labelsToRemove = new LinkedBlockingQueue<>();
for (T element : cache.vocabWords()) {
if (element.getIndex() > limit && !element.isSpecial() && !element.isLabel())
labelsToRemove.add(element.getLabel());
}
for (String label : labelsToRemove) {
cache.removeElement(label);
}
}
}
executorService.shutdown();
System.gc();
System.gc();
try {
Thread.sleep(1000);
} catch (Exception e) {
//
}
long endSequences = seqCount.get();
long endTime = System.currentTimeMillis();
double seconds = (endTime - startTime) / (double) 1000;
double seqPerSec = endSequences / seconds;
log.info("Sequences checked: [{}], Current vocabulary size: [{}]; Sequences/sec: [{}];", seqCount.get(), cache.numWords(), String.format("%.2f", seqPerSec));
return cache;
}
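The most interesting AtomicLong usage here is the execCounter/finCounter pair: the producer increments execCounter for every VocabRunnable it submits, each runnable bumps finCounter when it finishes, the producer stalls whenever the in-flight difference exceeds the processor count, and the final drain waits for the two counters to meet. A self-contained sketch of that backpressure idiom, assuming a plain fixed thread pool and stubbed per-document work:

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;

// execCounter counts submitted tasks, finCounter counts finished tasks; the
// difference is the number of in-flight tasks and drives the backpressure.
public class CounterBackpressureSketch {
    public static void main(String[] args) throws InterruptedException {
        final int numProc = Runtime.getRuntime().availableProcessors();
        final AtomicLong execCounter = new AtomicLong(0);
        final AtomicLong finCounter = new AtomicLong(0);
        ExecutorService executor = Executors.newFixedThreadPool(Math.max(numProc / 2, 2));

        for (int i = 0; i < 1_000; i++) {
            execCounter.incrementAndGet();
            executor.execute(() -> {
                try {
                    // stand-in for counting the tokens of one document
                    Thread.sleep(1);
                } catch (InterruptedException ignored) {
                    Thread.currentThread().interrupt();
                } finally {
                    finCounter.incrementAndGet();
                }
            });
            // backpressure: don't let more than numProc tasks pile up
            while (execCounter.get() - finCounter.get() > numProc) {
                Thread.sleep(1);
            }
        }

        // drain: wait until every submitted task has reported completion
        while (execCounter.get() != finCounter.get()) {
            Thread.sleep(2);
        }
        executor.shutdown();
        executor.awaitTermination(1, TimeUnit.MINUTES);
        System.out.println("processed " + finCounter.get() + " documents");
    }
}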