Use of gov.cms.bfd.pipeline.ccw.rif.extract.s3.S3RifFile in project beneficiary-fhir-data by CMSgov.
From class DataSetSubsetter, method downloadDataSet:
/**
 * Downloads the RIF files that comprise the full 1M beneficiary dummy data set from S3,
 * caching them locally.
 *
 * @param options the {@link ExtractionOptions} to use
 * @param dataSetS3KeyPrefix the S3 key prefix (i.e. directory) of the data set to download
 * @param downloadDirectory the Path to the directory to download the RIF files locally to
 * @return the {@link RifFile}s that comprise the full 1M beneficiary dummy data set
 */
private static List<RifFile> downloadDataSet(
    ExtractionOptions options, String dataSetS3KeyPrefix, Path downloadDirectory) {
  AmazonS3 s3Client = S3Utilities.createS3Client(options);
  TransferManager transferManager =
      TransferManagerBuilder.standard().withS3Client(s3Client).build();

  String dataSetPrefix = "data-random/" + dataSetS3KeyPrefix;
  String manifestSuffix = "1_manifest.xml";

  // Download the manifest first (unless a previous run already cached it locally).
  Path manifestDownloadPath = downloadDirectory.resolve(manifestSuffix);
  if (!Files.exists(manifestDownloadPath)) {
    String manifestKey = String.format("%s/%s", dataSetPrefix, manifestSuffix);
    Download manifestDownload =
        transferManager.download(
            options.getS3BucketName(), manifestKey, manifestDownloadPath.toFile());
    try {
      manifestDownload.waitForCompletion();
    } catch (AmazonClientException | InterruptedException e) {
      throw new RuntimeException(e);
    }
  }
  LOGGER.info("Manifest downloaded.");

  // Parse the manifest so we know which RIF files make up the data set.
  DataSetManifest dummyDataSetManifest;
  try {
    JAXBContext jaxbContext = JAXBContext.newInstance(DataSetManifest.class);
    Unmarshaller jaxbUnmarshaller = jaxbContext.createUnmarshaller();
    dummyDataSetManifest =
        (DataSetManifest) jaxbUnmarshaller.unmarshal(manifestDownloadPath.toFile());
  } catch (JAXBException e) {
    throw new UncheckedJaxbException(e);
  }

  // Download each RIF file listed in the manifest (skipping any already present locally).
  List<RifFile> rifFiles = new ArrayList<>();
  for (DataSetManifestEntry manifestEntry : dummyDataSetManifest.getEntries()) {
    String dataSetFileKey = String.format("%s/%s", dataSetPrefix, manifestEntry.getName());
    Path dataSetFileDownloadPath = downloadDirectory.resolve(manifestEntry.getName());
    if (!Files.exists(dataSetFileDownloadPath)) {
      LOGGER.info("Downloading RIF file: '{}'...", manifestEntry.getName());
      Download dataSetFileDownload =
          transferManager.download(
              options.getS3BucketName(), dataSetFileKey, dataSetFileDownloadPath.toFile());
      try {
        dataSetFileDownload.waitForCompletion();
      } catch (AmazonClientException | InterruptedException e) {
        throw new RuntimeException(e);
      }
    }
    RifFile dataSetFile = new LocalRifFile(dataSetFileDownloadPath, manifestEntry.getType());
    rifFiles.add(dataSetFile);
  }

  transferManager.shutdownNow();
  LOGGER.info("Original RIF files ready.");
  return rifFiles;
}
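A minimal sketch of how a caller inside DataSetSubsetter might invoke this private helper. The bucket name, key prefix, and the single-argument ExtractionOptions constructor are assumptions for illustration, not taken from the repository:

// Hypothetical caller: fetch the full dummy data set so it can be subsetted locally.
ExtractionOptions options = new ExtractionOptions("my-sample-bucket"); // assumed constructor
Path downloadDirectory = Files.createTempDirectory("rif-download");
List<RifFile> originalFiles =
    downloadDataSet(options, "1000000-beneficiaries", downloadDirectory);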
Use of gov.cms.bfd.pipeline.ccw.rif.extract.s3.S3RifFile in project beneficiary-fhir-data by CMSgov.
From class CcwRifLoadJob, method call:
/**
* @see gov.cms.bfd.pipeline.sharedutils.PipelineJob#call()
*/
@Override
public PipelineJobOutcome call() throws Exception {
  LOGGER.debug("Scanning for data sets to process...");

  // Update the queue from S3.
  dataSetQueue.updatePendingDataSets();

  // If no manifest was found, we're done (until next time).
  if (dataSetQueue.isEmpty()) {
    LOGGER.debug(LOG_MESSAGE_NO_DATA_SETS);
    listener.noDataAvailable();
    return PipelineJobOutcome.NOTHING_TO_DO;
  }

  // We've found the oldest manifest.
  DataSetManifest manifestToProcess = dataSetQueue.getNextDataSetToProcess().get();
  LOGGER.info(
      "Found data set to process: '{}'."
          + " There were '{}' total pending data sets and '{}' completed ones.",
      manifestToProcess.toString(),
      dataSetQueue.getPendingManifestsCount(),
      dataSetQueue.getCompletedManifestsCount().get());

  /*
   * We've got a data set to process. However, it might still be uploading
   * to S3, so we need to wait for that to complete before we start
   * processing it.
   */
  boolean alreadyLoggedWaitingEvent = false;
  while (!dataSetIsAvailable(manifestToProcess)) {
    /*
     * We're very patient here, so we keep looping, but it's prudent to
     * pause between iterations. TODO: this should eventually time out,
     * once we know how long transfers might take.
     */
    try {
      if (!alreadyLoggedWaitingEvent) {
        LOGGER.info("Data set not ready. Waiting for it to finish uploading...");
        alreadyLoggedWaitingEvent = true;
      }
      Thread.sleep(1000);
    } catch (InterruptedException e) {
      /*
       * Many Java applications use InterruptedExceptions to signal
       * that a thread should stop what it's doing ASAP. This app
       * doesn't, so this is unexpected, and accordingly, we don't
       * know what to do. Safest bet is to blow up.
       */
      throw new RuntimeException(e);
    }
  }

  /*
   * Huzzah! We've got a data set to process and we've verified it's all there
   * waiting for us in S3. Now convert it into a RifFilesEvent (containing a
   * List of asynchronously-downloading S3RifFiles).
   */
  LOGGER.info(LOG_MESSAGE_DATA_SET_READY);
  List<S3RifFile> rifFiles =
      manifestToProcess.getEntries().stream()
          .map(
              manifestEntry ->
                  new S3RifFile(
                      appMetrics, manifestEntry, s3TaskManager.downloadAsync(manifestEntry)))
          .collect(Collectors.toList());
  RifFilesEvent rifFilesEvent =
      new RifFilesEvent(manifestToProcess.getTimestamp(), new ArrayList<>(rifFiles));

  /*
   * To save time for the next data set, peek ahead at it. If it's available and
   * it looks like there's enough disk space, start downloading it early in the
   * background.
   */
  Optional<DataSetManifest> secondManifestToProcess = dataSetQueue.getSecondDataSetToProcess();
  if (secondManifestToProcess.isPresent() && dataSetIsAvailable(secondManifestToProcess.get())) {
    Path tmpdir = Paths.get(System.getProperty("java.io.tmpdir"));
    long usableFreeTempSpace;
    try {
      usableFreeTempSpace = Files.getFileStore(tmpdir).getUsableSpace();
    } catch (IOException e) {
      throw new UncheckedIOException(e);
    }
    if (usableFreeTempSpace >= (50 * GIGA)) {
      secondManifestToProcess
          .get()
          .getEntries()
          .forEach(manifestEntry -> s3TaskManager.downloadAsync(manifestEntry));
    }
  }

  /*
   * Now we hand that off to the DataSetMonitorListener, to do the *real*
   * work of actually processing that data set. It's important that we
   * block until it's completed, in order to ensure that we don't end up
   * processing multiple data sets in parallel (which would lead to data
   * consistency problems).
   */
  listener.dataAvailable(rifFilesEvent);
  LOGGER.info(LOG_MESSAGE_DATA_SET_COMPLETE);

  /*
   * Now that the data set has been processed, we need to ensure that we
   * don't end up processing it again. We ensure this in two ways: 1) we keep
   * a list of the data sets most recently processed, and 2) we rename the
   * S3 objects that comprise that data set. (#1 is required as S3
   * deletes/moves are only *eventually* consistent, so #2 may not take
   * effect right away.)
   */
  rifFiles.forEach(f -> f.cleanupTempFile());
  dataSetQueue.markProcessed(manifestToProcess);
  s3TaskManager.submit(new DataSetMoveTask(s3TaskManager, options, manifestToProcess));

  return PipelineJobOutcome.WORK_DONE;
}
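The dataSetIsAvailable(...) helper used above isn't shown in this excerpt. Based on how it's called, a minimal sketch might list the S3 keys under the manifest's key prefix and report the data set as available only once every manifest entry has a corresponding object. The keyPrefixFor(...) helper and the s3Client field below are assumptions for illustration, not the repository's actual API:

// Hypothetical sketch: the data set is "available" once every RIF file named
// in its manifest exists under the manifest's S3 key prefix.
private boolean dataSetIsAvailable(DataSetManifest manifest) {
  String keyPrefix = keyPrefixFor(manifest); // assumed helper, e.g. "Incoming/<timestamp>/"
  ListObjectsV2Request request =
      new ListObjectsV2Request()
          .withBucketName(options.getS3BucketName())
          .withPrefix(keyPrefix);
  Set<String> uploadedKeys = new HashSet<>();
  ListObjectsV2Result result;
  do {
    result = s3Client.listObjectsV2(request); // assumed AmazonS3 field
    result.getObjectSummaries().forEach(summary -> uploadedKeys.add(summary.getKey()));
    request.setContinuationToken(result.getNextContinuationToken());
  } while (result.isTruncated());
  return manifest.getEntries().stream()
      .allMatch(entry -> uploadedKeys.contains(keyPrefix + entry.getName()));
}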
Use of gov.cms.bfd.pipeline.ccw.rif.extract.s3.S3RifFile in project beneficiary-fhir-data by CMSgov.
From class S3RifFile, method cleanupTempFile:
/**
* Removes the local temporary file that was used to cache this {@link S3RifFile}'s corresponding
* S3 object data locally.
*/
public void cleanupTempFile() {
  LOGGER.debug("Cleaning up '{}'...", this);
  /*
   * We need to either cancel the download or wait for it to complete and then clean up the file.
   * However, canceling isn't a thread-safe operation (which is bonkers, but true), so we'll just
   * wait for completion.
   */
  try {
    ManifestEntryDownloadResult fileDownloadResult = waitForDownload();
    Files.deleteIfExists(fileDownloadResult.getLocalDownload());
  } catch (IOException e) {
    throw new UncheckedIOException(e);
  } catch (CancellationException e) {
    LOGGER.debug("Download was cancelled and can't be cleaned up.");
  }
  LOGGER.debug("Cleaned up '{}'.", this);
}
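Taken together, the three snippets trace an S3RifFile's lifecycle: it is constructed around an in-flight asynchronous download, consumed by the pipeline, and finally cleaned up. A condensed sketch of that lifecycle, assuming appMetrics, manifestEntry, and s3TaskManager are in scope as in CcwRifLoadJob.call() above:

// Wrap an in-flight async download; the file's contents become readable once
// the download completes.
S3RifFile rifFile =
    new S3RifFile(appMetrics, manifestEntry, s3TaskManager.downloadAsync(manifestEntry));
try {
  // ... hand the file to the RIF processing pipeline and block until it finishes ...
} finally {
  // Blocks until the download completes (cancellation isn't thread-safe), then
  // deletes the cached local copy.
  rifFile.cleanupTempFile();
}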