Use of gov.cms.bfd.pipeline.ccw.rif.extract.s3.task.S3TaskManager in project beneficiary-fhir-data by CMSgov.
The class CcwRifLoadJobIT, method multipleDataSetsTest.
/**
 * Tests {@link CcwRifLoadJob} when run against a bucket containing multiple data sets.
 *
 * @throws Exception (exceptions indicate test failure)
 */
@Test
public void multipleDataSetsTest() throws Exception {
  AmazonS3 s3Client = S3Utilities.createS3Client(new ExtractionOptions("foo"));
  Bucket bucket = null;
  try {
    /*
     * Create the (empty) bucket to run against, and populate it with
     * three data sets: two sharing a timestamp (but with different
     * sequence numbers) and one newer.
     */
    bucket = DataSetTestUtilities.createTestBucket(s3Client);
    ExtractionOptions options = new ExtractionOptions(bucket.getName(), Optional.empty(), Optional.of(1));
    LOGGER.info("Bucket created: '{}:{}'", s3Client.getS3AccountOwner().getDisplayName(), bucket.getName());
    DataSetManifest manifestA = new DataSetManifest(
        Instant.now().minus(1L, ChronoUnit.HOURS), 0,
        new DataSetManifestEntry("beneficiaries.rif", RifFileType.BENEFICIARY));
    s3Client.putObject(DataSetTestUtilities.createPutRequest(bucket, manifestA));
    s3Client.putObject(DataSetTestUtilities.createPutRequest(
        bucket, manifestA, manifestA.getEntries().get(0), StaticRifResource.SAMPLE_A_BENES.getResourceUrl()));
    DataSetManifest manifestB = new DataSetManifest(
        manifestA.getTimestampText(), 1,
        new DataSetManifestEntry("pde.rif", RifFileType.PDE));
    s3Client.putObject(DataSetTestUtilities.createPutRequest(bucket, manifestB));
    s3Client.putObject(DataSetTestUtilities.createPutRequest(
        bucket, manifestB, manifestB.getEntries().get(0), StaticRifResource.SAMPLE_A_BENES.getResourceUrl()));
    DataSetManifest manifestC = new DataSetManifest(
        Instant.now(), 0,
        new DataSetManifestEntry("carrier.rif", RifFileType.CARRIER));
    s3Client.putObject(DataSetTestUtilities.createPutRequest(bucket, manifestC));
    s3Client.putObject(DataSetTestUtilities.createPutRequest(
        bucket, manifestC, manifestC.getEntries().get(0), StaticRifResource.SAMPLE_A_CARRIER.getResourceUrl()));

    // Run the job.
    MockDataSetMonitorListener listener = new MockDataSetMonitorListener();
    S3TaskManager s3TaskManager = new S3TaskManager(
        PipelineTestUtils.get().getPipelineApplicationState().getMetrics(), options);
    CcwRifLoadJob ccwJob = new CcwRifLoadJob(
        PipelineTestUtils.get().getPipelineApplicationState().getMetrics(), options, s3TaskManager, listener);
    ccwJob.call();

    // Verify what was handed off to the DataSetMonitorListener.
    assertEquals(0, listener.getNoDataAvailableEvents());
    assertEquals(1, listener.getDataEvents().size());
    assertEquals(manifestA.getTimestamp(), listener.getDataEvents().get(0).getTimestamp());
    assertEquals(manifestA.getEntries().size(), listener.getDataEvents().get(0).getFileEvents().size());
    assertEquals(0, listener.getErrorEvents().size());

    /*
     * Verify that the first data set was renamed and the other two are
     * still pending.
     */
    DataSetTestUtilities.waitForBucketObjectCount(
        s3Client, bucket, CcwRifLoadJob.S3_PREFIX_PENDING_DATA_SETS,
        1 + manifestB.getEntries().size() + 1 + manifestC.getEntries().size(),
        java.time.Duration.ofSeconds(10));
    DataSetTestUtilities.waitForBucketObjectCount(
        s3Client, bucket, CcwRifLoadJob.S3_PREFIX_COMPLETED_DATA_SETS,
        1 + manifestA.getEntries().size(),
        java.time.Duration.ofSeconds(10));
  } finally {
    if (bucket != null) DataSetTestUtilities.deleteObjectsAndBucket(s3Client, bucket);
  }
}
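The assertions above rely on the job picking the oldest pending manifest first: timestamps are compared before sequence numbers, which is why manifestA (older timestamp, sequence 0) is processed ahead of manifestB and manifestC. A minimal sketch of that ordering, assuming a hypothetical getSequenceId() accessor on DataSetManifest (getTimestamp() appears in the test itself; the real class may order differently):

import java.util.Comparator;

class ManifestOrdering {
  /**
   * "Oldest first": earlier timestamps win, with the sequence number
   * breaking ties (as between manifestA and manifestB above).
   * getSequenceId() is an assumed accessor name.
   */
  static final Comparator<DataSetManifest> OLDEST_FIRST =
      Comparator.comparing(DataSetManifest::getTimestamp)
          .thenComparingInt(DataSetManifest::getSequenceId);
}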
Use of gov.cms.bfd.pipeline.ccw.rif.extract.s3.task.S3TaskManager in project beneficiary-fhir-data by CMSgov.
The class CcwRifLoadJobIT, method singleDataSetTest.
/**
* Tests {@link CcwRifLoadJob} when run against a bucket with a single data set.
*
* @throws Exception (exceptions indicate test failure)
*/
@Test
public void singleDataSetTest() throws Exception {
  AmazonS3 s3Client = S3Utilities.createS3Client(new ExtractionOptions("foo"));
  Bucket bucket = null;
  try {
    /*
     * Create the (empty) bucket to run against, and populate it with a
     * data set.
     */
    bucket = DataSetTestUtilities.createTestBucket(s3Client);
    ExtractionOptions options = new ExtractionOptions(bucket.getName());
    LOGGER.info("Bucket created: '{}:{}'", s3Client.getS3AccountOwner().getDisplayName(), bucket.getName());
    DataSetManifest manifest = new DataSetManifest(
        Instant.now(), 0,
        new DataSetManifestEntry("beneficiaries.rif", RifFileType.BENEFICIARY),
        new DataSetManifestEntry("carrier.rif", RifFileType.CARRIER));
    s3Client.putObject(DataSetTestUtilities.createPutRequest(bucket, manifest));
    s3Client.putObject(DataSetTestUtilities.createPutRequest(
        bucket, manifest, manifest.getEntries().get(0), StaticRifResource.SAMPLE_A_BENES.getResourceUrl()));
    s3Client.putObject(DataSetTestUtilities.createPutRequest(
        bucket, manifest, manifest.getEntries().get(1), StaticRifResource.SAMPLE_A_CARRIER.getResourceUrl()));

    // Run the job.
    MockDataSetMonitorListener listener = new MockDataSetMonitorListener();
    S3TaskManager s3TaskManager = new S3TaskManager(
        PipelineTestUtils.get().getPipelineApplicationState().getMetrics(), options);
    CcwRifLoadJob ccwJob = new CcwRifLoadJob(
        PipelineTestUtils.get().getPipelineApplicationState().getMetrics(), options, s3TaskManager, listener);
    ccwJob.call();

    // Verify what was handed off to the DataSetMonitorListener.
    assertEquals(0, listener.getNoDataAvailableEvents());
    assertEquals(1, listener.getDataEvents().size());
    assertEquals(manifest.getTimestamp(), listener.getDataEvents().get(0).getTimestamp());
    assertEquals(manifest.getEntries().size(), listener.getDataEvents().get(0).getFileEvents().size());
    assertEquals(0, listener.getErrorEvents().size());

    // Verify that the data set was renamed.
    DataSetTestUtilities.waitForBucketObjectCount(
        s3Client, bucket, CcwRifLoadJob.S3_PREFIX_PENDING_DATA_SETS, 0,
        java.time.Duration.ofSeconds(10));
    DataSetTestUtilities.waitForBucketObjectCount(
        s3Client, bucket, CcwRifLoadJob.S3_PREFIX_COMPLETED_DATA_SETS, 1 + manifest.getEntries().size(),
        java.time.Duration.ofSeconds(10));
  } finally {
    if (bucket != null) DataSetTestUtilities.deleteObjectsAndBucket(s3Client, bucket);
  }
}
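The MockDataSetMonitorListener used in both tests is just a recording test double. A minimal sketch of what such a listener might look like, assuming DataSetMonitorListener declares noDataAvailable(), dataAvailable(RifFilesEvent), and an error callback (the error method's exact name and signature are assumptions; only the first two appear in the code on this page):

import java.util.ArrayList;
import java.util.List;

/** Records every callback so tests can assert on what the job reported. */
class RecordingDataSetMonitorListener implements DataSetMonitorListener {
  private int noDataAvailableEvents = 0;
  private final List<RifFilesEvent> dataEvents = new ArrayList<>();
  private final List<Throwable> errorEvents = new ArrayList<>();

  @Override
  public void noDataAvailable() {
    noDataAvailableEvents++;
  }

  @Override
  public void dataAvailable(RifFilesEvent rifFilesEvent) {
    dataEvents.add(rifFilesEvent);
  }

  @Override
  public void errorOccurred(Throwable error) { // assumed callback name
    errorEvents.add(error);
  }

  int getNoDataAvailableEvents() { return noDataAvailableEvents; }
  List<RifFilesEvent> getDataEvents() { return dataEvents; }
  List<Throwable> getErrorEvents() { return errorEvents; }
}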
Use of gov.cms.bfd.pipeline.ccw.rif.extract.s3.task.S3TaskManager in project beneficiary-fhir-data by CMSgov.
The class CcwRifLoadJob, method call.
/**
* @see gov.cms.bfd.pipeline.sharedutils.PipelineJob#call()
*/
@Override
public PipelineJobOutcome call() throws Exception {
  LOGGER.debug("Scanning for data sets to process...");

  // Update the queue from S3.
  dataSetQueue.updatePendingDataSets();

  // If no manifest was found, we're done (until next time).
  if (dataSetQueue.isEmpty()) {
    LOGGER.debug(LOG_MESSAGE_NO_DATA_SETS);
    listener.noDataAvailable();
    return PipelineJobOutcome.NOTHING_TO_DO;
  }

  // We've found the oldest manifest.
  DataSetManifest manifestToProcess = dataSetQueue.getNextDataSetToProcess().get();
  LOGGER.info(
      "Found data set to process: '{}'. There were '{}' total pending data sets and '{}' completed ones.",
      manifestToProcess.toString(),
      dataSetQueue.getPendingManifestsCount(),
      dataSetQueue.getCompletedManifestsCount().get());

  /*
   * We've got a data set to process. However, it might still be uploading
   * to S3, so we need to wait for that to complete before we start
   * processing it.
   */
  boolean alreadyLoggedWaitingEvent = false;
  while (!dataSetIsAvailable(manifestToProcess)) {
    /*
     * We're very patient here, so we keep looping, but it's prudent to
     * pause between each iteration. TODO: this should eventually time out,
     * once we know how long transfers might take.
     */
    try {
      if (!alreadyLoggedWaitingEvent) {
        LOGGER.info("Data set not ready. Waiting for it to finish uploading...");
        alreadyLoggedWaitingEvent = true;
      }
      Thread.sleep(1000);
    } catch (InterruptedException e) {
      /*
       * Many Java applications use InterruptedExceptions to signal that a
       * thread should stop what it's doing ASAP. This app doesn't, so this
       * is unexpected, and accordingly, we don't know what to do. The
       * safest bet is to blow up.
       */
      throw new RuntimeException(e);
    }
  }

  /*
   * Huzzah! We've got a data set to process and we've verified it's all
   * there waiting for us in S3. Now convert it into a RifFilesEvent
   * (containing a List of asynchronously downloading S3RifFiles).
   */
  LOGGER.info(LOG_MESSAGE_DATA_SET_READY);
  List<S3RifFile> rifFiles =
      manifestToProcess.getEntries().stream()
          .map(manifestEntry ->
              new S3RifFile(appMetrics, manifestEntry, s3TaskManager.downloadAsync(manifestEntry)))
          .collect(Collectors.toList());
  RifFilesEvent rifFilesEvent =
      new RifFilesEvent(manifestToProcess.getTimestamp(), new ArrayList<>(rifFiles));

  /*
   * To save time for the next data set, peek ahead at it. If it's available
   * and it looks like there's enough disk space, start downloading it early
   * in the background.
   */
  Optional<DataSetManifest> secondManifestToProcess = dataSetQueue.getSecondDataSetToProcess();
  if (secondManifestToProcess.isPresent() && dataSetIsAvailable(secondManifestToProcess.get())) {
    Path tmpdir = Paths.get(System.getProperty("java.io.tmpdir"));
    long usableFreeTempSpace;
    try {
      usableFreeTempSpace = Files.getFileStore(tmpdir).getUsableSpace();
    } catch (IOException e) {
      throw new UncheckedIOException(e);
    }
    if (usableFreeTempSpace >= (50 * GIGA)) {
      secondManifestToProcess.get().getEntries().stream()
          .forEach(manifestEntry -> s3TaskManager.downloadAsync(manifestEntry));
    }
  }

  /*
   * Now we hand that off to the DataSetMonitorListener, to do the *real*
   * work of actually processing the data set. It's important that we block
   * until it's completed, in order to ensure that we don't end up
   * processing multiple data sets in parallel (which would lead to data
   * consistency problems).
   */
  listener.dataAvailable(rifFilesEvent);
  LOGGER.info(LOG_MESSAGE_DATA_SET_COMPLETE);

  /*
   * Now that the data set has been processed, we need to ensure that we
   * don't end up processing it again. We ensure this in two ways: 1) we
   * keep a list of the data sets most recently processed, and 2) we rename
   * the S3 objects that comprise the data set. (#1 is required because S3
   * deletes/moves are only *eventually* consistent, so #2 may not take
   * effect right away.)
   */
  rifFiles.stream().forEach(f -> f.cleanupTempFile());
  dataSetQueue.markProcessed(manifestToProcess);
  s3TaskManager.submit(new DataSetMoveTask(s3TaskManager, options, manifestToProcess));
  return PipelineJobOutcome.WORK_DONE;
}
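The TODO in the wait loop above notes that the polling should eventually time out rather than spin forever. One way to bound it, sketched here as a standalone helper (the helper and its deadline value are hypothetical, not part of the job):

import java.time.Duration;
import java.time.Instant;
import java.util.function.BooleanSupplier;

final class WaitUtils {
  /**
   * Polls the given check once per second until it passes, throwing if the
   * deadline elapses first. The job above currently waits indefinitely.
   */
  static void waitUntil(BooleanSupplier dataSetIsAvailable, Duration maxWait)
      throws InterruptedException {
    Instant deadline = Instant.now().plus(maxWait);
    while (!dataSetIsAvailable.getAsBoolean()) {
      if (Instant.now().isAfter(deadline)) {
        throw new IllegalStateException("Data set still unavailable after " + maxWait);
      }
      Thread.sleep(1000);
    }
  }
}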
Use of gov.cms.bfd.pipeline.ccw.rif.extract.s3.task.S3TaskManager in project beneficiary-fhir-data by CMSgov.
The class PipelineApplication, method createCcwRifLoadJob.
/**
* @param loadOptions the {@link CcwRifLoadOptions} to use
* @param appState the {@link PipelineApplicationState} to use
* @return a {@link CcwRifLoadJob} instance for the application to use
*/
private static PipelineJob<?> createCcwRifLoadJob(
    CcwRifLoadOptions loadOptions, PipelineApplicationState appState) {
  /*
   * Create the services that will be used to handle each stage in the
   * extract, transform, and load process.
   */
  S3TaskManager s3TaskManager =
      new S3TaskManager(appState.getMetrics(), loadOptions.getExtractionOptions());
  RifFilesProcessor rifProcessor = new RifFilesProcessor();
  RifLoader rifLoader = new RifLoader(loadOptions.getLoadOptions(), appState);

  /*
   * Create the DataSetMonitorListener that will glue those stages together
   * and run them all for each data set that is found.
   */
  DataSetMonitorListener dataSetMonitorListener =
      new DefaultDataSetMonitorListener(
          appState.getMetrics(), PipelineApplication::handleUncaughtException, rifProcessor, rifLoader);
  CcwRifLoadJob ccwRifLoadJob =
      new CcwRifLoadJob(
          appState.getMetrics(), loadOptions.getExtractionOptions(), s3TaskManager, dataSetMonitorListener);
  return ccwRifLoadJob;
}
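Something elsewhere in the application must invoke the returned job repeatedly; that scheduling is not shown on this page. A hypothetical driver loop, included only to illustrate the PipelineJobOutcome contract seen in call() above (it assumes PipelineJob#call() returns the same PipelineJobOutcome; the real application's scheduler may work quite differently):

final class PipelineDriver {
  /**
   * Hypothetical sketch: re-run the job immediately after real work, but
   * pause between polls when there was nothing to do. The poll interval is
   * an assumption.
   */
  static void drive(PipelineJob<?> job) throws Exception {
    while (!Thread.currentThread().isInterrupted()) {
      if (job.call() == PipelineJobOutcome.NOTHING_TO_DO) {
        Thread.sleep(30_000);
      }
    }
  }
}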
Use of gov.cms.bfd.pipeline.ccw.rif.extract.s3.task.S3TaskManager in project beneficiary-fhir-data by CMSgov.
The class ManifestEntryDownloadTaskIT, method testMD5ChkSum.
/**
 * Tests that the MD5 checksum of the downloaded S3 file matches the locally generated MD5 checksum.
 */
@SuppressWarnings("deprecation")
@Test
public void testMD5ChkSum() throws Exception {
  AmazonS3 s3Client = S3Utilities.createS3Client(new ExtractionOptions("foo"));
  Bucket bucket = null;
  try {
    bucket = DataSetTestUtilities.createTestBucket(s3Client);
    ExtractionOptions options = new ExtractionOptions(bucket.getName());
    LOGGER.info("Bucket created: '{}:{}'", s3Client.getS3AccountOwner().getDisplayName(), bucket.getName());
    DataSetManifest manifest = new DataSetManifest(
        Instant.now(), 0,
        new DataSetManifestEntry("beneficiaries.rif", RifFileType.BENEFICIARY));

    // Upload the beneficiary sample file to the S3 bucket created above.
    s3Client.putObject(DataSetTestUtilities.createPutRequest(bucket, manifest));
    s3Client.putObject(DataSetTestUtilities.createPutRequest(
        bucket, manifest, manifest.getEntries().get(0), StaticRifResource.SAMPLE_A_BENES.getResourceUrl()));

    // Download the file from S3 that was just uploaded above.
    GetObjectRequest objectRequest = new GetObjectRequest(
        bucket.getName(),
        String.format(
            "%s/%s/%s",
            CcwRifLoadJob.S3_PREFIX_PENDING_DATA_SETS,
            manifest.getEntries().get(0).getParentManifest().getTimestampText(),
            manifest.getEntries().get(0).getName()));
    Path localTempFile = Files.createTempFile("data-pipeline-s3-temp", ".rif");
    s3TaskManager = new S3TaskManager(
        PipelineTestUtils.get().getPipelineApplicationState().getMetrics(),
        new ExtractionOptions(options.getS3BucketName()));
    LOGGER.info("Downloading '{}' to '{}'...", objectRequest.getKey(), localTempFile.toAbsolutePath());
    Download downloadHandle =
        s3TaskManager.getS3TransferManager().download(objectRequest, localTempFile.toFile());
    downloadHandle.waitForCompletion();

    // Compare the checksum computed locally against the one stored in the
    // object's user metadata at upload time.
    InputStream downloadedInputStream = new FileInputStream(localTempFile.toString());
    String generatedMD5ChkSum = ManifestEntryDownloadTask.computeMD5ChkSum(downloadedInputStream);
    LOGGER.info("The generated MD5 value from Java (Base64 encoded) is: {}", generatedMD5ChkSum);
    String downloadedFileMD5ChkSum = downloadHandle.getObjectMetadata().getUserMetaDataOf("md5chksum");
    LOGGER.info("The MD5 value from the AWS S3 file's metadata is: {}", downloadedFileMD5ChkSum);
    assertEquals(
        downloadedFileMD5ChkSum,
        generatedMD5ChkSum,
        "Checksum doesn't match on downloaded file " + objectRequest.getKey());
    LOGGER.info("Downloaded '{}' to '{}'.", objectRequest.getKey(), localTempFile.toAbsolutePath());
  } catch (IOException e) {
    throw new UncheckedIOException(e);
  } catch (AmazonClientException e) {
    throw new AwsFailureException(e);
  } catch (InterruptedException e) {
    // Shouldn't happen, as our apps don't use thread interrupts.
    throw new BadCodeMonkeyException(e);
  } finally {
    if (bucket != null) DataSetTestUtilities.deleteObjectsAndBucket(s3Client, bucket);
  }
}
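For reference, the Base64-encoded MD5 value that ManifestEntryDownloadTask.computeMD5ChkSum(InputStream) produces can be computed with nothing but the JDK. A minimal sketch of an equivalent helper (the actual implementation may differ):

import java.io.IOException;
import java.io.InputStream;
import java.security.DigestInputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Base64;

final class Md5Sketch {
  /** Streams the input fully through an MD5 digest and Base64-encodes the result. */
  static String base64Md5(InputStream input) throws IOException, NoSuchAlgorithmException {
    MessageDigest md5 = MessageDigest.getInstance("MD5");
    try (DigestInputStream digestStream = new DigestInputStream(input, md5)) {
      byte[] buffer = new byte[8192];
      while (digestStream.read(buffer) != -1) {
        // Reading drives the digest; the bytes themselves are discarded.
      }
    }
    return Base64.getEncoder().encodeToString(md5.digest());
  }
}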