Use of gov.cms.bfd.pipeline.ccw.rif.extract.s3.task.S3TaskManager in project beneficiary-fhir-data by CMSgov.
The class CcwRifLoadJobIT, method multipleDataSetsTest.
/**
 * Tests {@link CcwRifLoadJob} when run against a bucket containing multiple data sets.
 *
 * @throws Exception (exceptions indicate test failure)
 */
@Test
public void multipleDataSetsTest() throws Exception {
  AmazonS3 s3Client = S3Utilities.createS3Client(new ExtractionOptions("foo"));
  Bucket bucket = null;
  try {
    /*
     * Create the (empty) bucket to run against, and populate it with
     * three data sets: two sharing a timestamp (but with different
     * sequence numbers) and one newer.
     */
    bucket = DataSetTestUtilities.createTestBucket(s3Client);
    ExtractionOptions options = new ExtractionOptions(bucket.getName(), Optional.empty(), Optional.of(1));
    LOGGER.info("Bucket created: '{}:{}'", s3Client.getS3AccountOwner().getDisplayName(), bucket.getName());
    DataSetManifest manifestA = new DataSetManifest(
        Instant.now().minus(1L, ChronoUnit.HOURS), 0,
        new DataSetManifestEntry("beneficiaries.rif", RifFileType.BENEFICIARY));
    s3Client.putObject(DataSetTestUtilities.createPutRequest(bucket, manifestA));
    s3Client.putObject(DataSetTestUtilities.createPutRequest(
        bucket, manifestA, manifestA.getEntries().get(0), StaticRifResource.SAMPLE_A_BENES.getResourceUrl()));
    DataSetManifest manifestB = new DataSetManifest(
        manifestA.getTimestampText(), 1,
        new DataSetManifestEntry("pde.rif", RifFileType.PDE));
    s3Client.putObject(DataSetTestUtilities.createPutRequest(bucket, manifestB));
    s3Client.putObject(DataSetTestUtilities.createPutRequest(
        bucket, manifestB, manifestB.getEntries().get(0), StaticRifResource.SAMPLE_A_BENES.getResourceUrl()));
    DataSetManifest manifestC = new DataSetManifest(
        Instant.now(), 0,
        new DataSetManifestEntry("carrier.rif", RifFileType.CARRIER));
    s3Client.putObject(DataSetTestUtilities.createPutRequest(bucket, manifestC));
    s3Client.putObject(DataSetTestUtilities.createPutRequest(
        bucket, manifestC, manifestC.getEntries().get(0), StaticRifResource.SAMPLE_A_CARRIER.getResourceUrl()));

    // Run the job.
    MockDataSetMonitorListener listener = new MockDataSetMonitorListener();
    S3TaskManager s3TaskManager = new S3TaskManager(
        PipelineTestUtils.get().getPipelineApplicationState().getMetrics(), options);
    CcwRifLoadJob ccwJob = new CcwRifLoadJob(
        PipelineTestUtils.get().getPipelineApplicationState().getMetrics(), options, s3TaskManager, listener);
    ccwJob.call();

    // Verify what was handed off to the DataSetMonitorListener.
    assertEquals(0, listener.getNoDataAvailableEvents());
    assertEquals(1, listener.getDataEvents().size());
    assertEquals(manifestA.getTimestamp(), listener.getDataEvents().get(0).getTimestamp());
    assertEquals(manifestA.getEntries().size(), listener.getDataEvents().get(0).getFileEvents().size());
    assertEquals(0, listener.getErrorEvents().size());

    /*
     * Verify that the first data set was renamed and the other two are
     * still pending.
     */
    DataSetTestUtilities.waitForBucketObjectCount(
        s3Client, bucket, CcwRifLoadJob.S3_PREFIX_PENDING_DATA_SETS,
        1 + manifestB.getEntries().size() + 1 + manifestC.getEntries().size(),
        java.time.Duration.ofSeconds(10));
    DataSetTestUtilities.waitForBucketObjectCount(
        s3Client, bucket, CcwRifLoadJob.S3_PREFIX_COMPLETED_DATA_SETS,
        1 + manifestA.getEntries().size(),
        java.time.Duration.ofSeconds(10));
  } finally {
    if (bucket != null) DataSetTestUtilities.deleteObjectsAndBucket(s3Client, bucket);
  }
}
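The assertions above rely on the job picking the oldest pending manifest first: timestamps are compared before sequence numbers, which is why manifestA (older timestamp, sequence 0) is processed ahead of manifestB and manifestC. A minimal sketch of that ordering, assuming a hypothetical getSequenceId() accessor on DataSetManifest (getTimestamp() appears in the test itself; the real class may order differently):

import java.util.Comparator;

class ManifestOrdering {
  /**
   * "Oldest first": earlier timestamps win, with the sequence number
   * breaking ties (as between manifestA and manifestB above).
   * getSequenceId() is an assumed accessor name.
   */
  static final Comparator<DataSetManifest> OLDEST_FIRST =
      Comparator.comparing(DataSetManifest::getTimestamp)
          .thenComparingInt(DataSetManifest::getSequenceId);
}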
Use of gov.cms.bfd.pipeline.ccw.rif.extract.s3.task.S3TaskManager in project beneficiary-fhir-data by CMSgov.
The class CcwRifLoadJobIT, method singleDataSetTest.
/**
* Tests {@link CcwRifLoadJob} when run against a bucket with a single data set.
*
* @throws Exception (exceptions indicate test failure)
*/
@Test
public void singleDataSetTest() throws Exception {
  AmazonS3 s3Client = S3Utilities.createS3Client(new ExtractionOptions("foo"));
  Bucket bucket = null;
  try {
    /*
     * Create the (empty) bucket to run against, and populate it with a
     * data set.
     */
    bucket = DataSetTestUtilities.createTestBucket(s3Client);
    ExtractionOptions options = new ExtractionOptions(bucket.getName());
    LOGGER.info("Bucket created: '{}:{}'", s3Client.getS3AccountOwner().getDisplayName(), bucket.getName());
    DataSetManifest manifest = new DataSetManifest(
        Instant.now(), 0,
        new DataSetManifestEntry("beneficiaries.rif", RifFileType.BENEFICIARY),
        new DataSetManifestEntry("carrier.rif", RifFileType.CARRIER));
    s3Client.putObject(DataSetTestUtilities.createPutRequest(bucket, manifest));
    s3Client.putObject(DataSetTestUtilities.createPutRequest(
        bucket, manifest, manifest.getEntries().get(0), StaticRifResource.SAMPLE_A_BENES.getResourceUrl()));
    s3Client.putObject(DataSetTestUtilities.createPutRequest(
        bucket, manifest, manifest.getEntries().get(1), StaticRifResource.SAMPLE_A_CARRIER.getResourceUrl()));

    // Run the job.
    MockDataSetMonitorListener listener = new MockDataSetMonitorListener();
    S3TaskManager s3TaskManager = new S3TaskManager(
        PipelineTestUtils.get().getPipelineApplicationState().getMetrics(), options);
    CcwRifLoadJob ccwJob = new CcwRifLoadJob(
        PipelineTestUtils.get().getPipelineApplicationState().getMetrics(), options, s3TaskManager, listener);
    ccwJob.call();

    // Verify what was handed off to the DataSetMonitorListener.
    assertEquals(0, listener.getNoDataAvailableEvents());
    assertEquals(1, listener.getDataEvents().size());
    assertEquals(manifest.getTimestamp(), listener.getDataEvents().get(0).getTimestamp());
    assertEquals(manifest.getEntries().size(), listener.getDataEvents().get(0).getFileEvents().size());
    assertEquals(0, listener.getErrorEvents().size());

    // Verify that the data set was renamed.
    DataSetTestUtilities.waitForBucketObjectCount(
        s3Client, bucket, CcwRifLoadJob.S3_PREFIX_PENDING_DATA_SETS, 0,
        java.time.Duration.ofSeconds(10));
    DataSetTestUtilities.waitForBucketObjectCount(
        s3Client, bucket, CcwRifLoadJob.S3_PREFIX_COMPLETED_DATA_SETS, 1 + manifest.getEntries().size(),
        java.time.Duration.ofSeconds(10));
  } finally {
    if (bucket != null) DataSetTestUtilities.deleteObjectsAndBucket(s3Client, bucket);
  }
}
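The MockDataSetMonitorListener used in both tests is just a recording test double. A minimal sketch of what such a listener might look like, assuming DataSetMonitorListener declares noDataAvailable(), dataAvailable(RifFilesEvent), and an error callback (the error method's exact name and signature are assumptions; only the first two appear in the code on this page):

import java.util.ArrayList;
import java.util.List;

/** Records every callback so tests can assert on what the job reported. */
class RecordingDataSetMonitorListener implements DataSetMonitorListener {
  private int noDataAvailableEvents = 0;
  private final List<RifFilesEvent> dataEvents = new ArrayList<>();
  private final List<Throwable> errorEvents = new ArrayList<>();

  @Override
  public void noDataAvailable() {
    noDataAvailableEvents++;
  }

  @Override
  public void dataAvailable(RifFilesEvent rifFilesEvent) {
    dataEvents.add(rifFilesEvent);
  }

  @Override
  public void errorOccurred(Throwable error) { // assumed callback name
    errorEvents.add(error);
  }

  int getNoDataAvailableEvents() { return noDataAvailableEvents; }
  List<RifFilesEvent> getDataEvents() { return dataEvents; }
  List<Throwable> getErrorEvents() { return errorEvents; }
}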
Use of gov.cms.bfd.pipeline.ccw.rif.extract.s3.task.S3TaskManager in project beneficiary-fhir-data by CMSgov.
The class CcwRifLoadJob, method call.
/**
* @see gov.cms.bfd.pipeline.sharedutils.PipelineJob#call()
*/
@Override
public PipelineJobOutcome call() throws Exception {
  LOGGER.debug("Scanning for data sets to process...");

  // Update the queue from S3.
  dataSetQueue.updatePendingDataSets();

  // If no manifest was found, we're done (until next time).
  if (dataSetQueue.isEmpty()) {
    LOGGER.debug(LOG_MESSAGE_NO_DATA_SETS);
    listener.noDataAvailable();
    return PipelineJobOutcome.NOTHING_TO_DO;
  }

  // We've found the oldest manifest.
  DataSetManifest manifestToProcess = dataSetQueue.getNextDataSetToProcess().get();
  LOGGER.info(
      "Found data set to process: '{}'. There were '{}' total pending data sets and '{}' completed ones.",
      manifestToProcess.toString(),
      dataSetQueue.getPendingManifestsCount(),
      dataSetQueue.getCompletedManifestsCount().get());

  /*
   * We've got a data set to process. However, it might still be uploading
   * to S3, so we need to wait for that to complete before we start
   * processing it.
   */
  boolean alreadyLoggedWaitingEvent = false;
  while (!dataSetIsAvailable(manifestToProcess)) {
    /*
     * We're very patient here, so we keep looping, but it's prudent to
     * pause between each iteration. TODO: this should eventually time out,
     * once we know how long transfers might take.
     */
    try {
      if (!alreadyLoggedWaitingEvent) {
        LOGGER.info("Data set not ready. Waiting for it to finish uploading...");
        alreadyLoggedWaitingEvent = true;
      }
      Thread.sleep(1000);
    } catch (InterruptedException e) {
      /*
       * Many Java applications use InterruptedExceptions to signal that a
       * thread should stop what it's doing ASAP. This app doesn't, so this
       * is unexpected, and accordingly, we don't know what to do. The
       * safest bet is to blow up.
       */
      throw new RuntimeException(e);
    }
  }

  /*
   * Huzzah! We've got a data set to process and we've verified it's all
   * there waiting for us in S3. Now convert it into a RifFilesEvent
   * (containing a List of asynchronously downloading S3RifFiles).
   */
  LOGGER.info(LOG_MESSAGE_DATA_SET_READY);
  List<S3RifFile> rifFiles =
      manifestToProcess.getEntries().stream()
          .map(manifestEntry ->
              new S3RifFile(appMetrics, manifestEntry, s3TaskManager.downloadAsync(manifestEntry)))
          .collect(Collectors.toList());
  RifFilesEvent rifFilesEvent =
      new RifFilesEvent(manifestToProcess.getTimestamp(), new ArrayList<>(rifFiles));

  /*
   * To save time for the next data set, peek ahead at it. If it's available
   * and it looks like there's enough disk space, start downloading it early
   * in the background.
   */
  Optional<DataSetManifest> secondManifestToProcess = dataSetQueue.getSecondDataSetToProcess();
  if (secondManifestToProcess.isPresent() && dataSetIsAvailable(secondManifestToProcess.get())) {
    Path tmpdir = Paths.get(System.getProperty("java.io.tmpdir"));
    long usableFreeTempSpace;
    try {
      usableFreeTempSpace = Files.getFileStore(tmpdir).getUsableSpace();
    } catch (IOException e) {
      throw new UncheckedIOException(e);
    }
    if (usableFreeTempSpace >= (50 * GIGA)) {
      secondManifestToProcess.get().getEntries().stream()
          .forEach(manifestEntry -> s3TaskManager.downloadAsync(manifestEntry));
    }
  }

  /*
   * Now we hand that off to the DataSetMonitorListener, to do the *real*
   * work of actually processing the data set. It's important that we block
   * until it's completed, in order to ensure that we don't end up
   * processing multiple data sets in parallel (which would lead to data
   * consistency problems).
   */
  listener.dataAvailable(rifFilesEvent);
  LOGGER.info(LOG_MESSAGE_DATA_SET_COMPLETE);

  /*
   * Now that the data set has been processed, we need to ensure that we
   * don't end up processing it again. We ensure this in two ways: 1) we
   * keep a list of the data sets most recently processed, and 2) we rename
   * the S3 objects that comprise the data set. (#1 is required because S3
   * deletes/moves are only *eventually* consistent, so #2 may not take
   * effect right away.)
   */
  rifFiles.stream().forEach(f -> f.cleanupTempFile());
  dataSetQueue.markProcessed(manifestToProcess);
  s3TaskManager.submit(new DataSetMoveTask(s3TaskManager, options, manifestToProcess));
  return PipelineJobOutcome.WORK_DONE;
}
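The TODO in the wait loop above notes that the polling should eventually time out rather than spin forever. One way to bound it, sketched here as a standalone helper (the helper and its deadline value are hypothetical, not part of the job):

import java.time.Duration;
import java.time.Instant;
import java.util.function.BooleanSupplier;

final class WaitUtils {
  /**
   * Polls the given check once per second until it passes, throwing if the
   * deadline elapses first. The job above currently waits indefinitely.
   */
  static void waitUntil(BooleanSupplier dataSetIsAvailable, Duration maxWait)
      throws InterruptedException {
    Instant deadline = Instant.now().plus(maxWait);
    while (!dataSetIsAvailable.getAsBoolean()) {
      if (Instant.now().isAfter(deadline)) {
        throw new IllegalStateException("Data set still unavailable after " + maxWait);
      }
      Thread.sleep(1000);
    }
  }
}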
Use of gov.cms.bfd.pipeline.ccw.rif.extract.s3.task.S3TaskManager in project beneficiary-fhir-data by CMSgov.
The class PipelineApplication, method createCcwRifLoadJob.
/**
* @param loadOptions the {@link CcwRifLoadOptions} to use
* @param appState the {@link PipelineApplicationState} to use
* @return a {@link CcwRifLoadJob} instance for the application to use
*/
private static PipelineJob<?> createCcwRifLoadJob(
    CcwRifLoadOptions loadOptions, PipelineApplicationState appState) {
  /*
   * Create the services that will be used to handle each stage in the
   * extract, transform, and load process.
   */
  S3TaskManager s3TaskManager =
      new S3TaskManager(appState.getMetrics(), loadOptions.getExtractionOptions());
  RifFilesProcessor rifProcessor = new RifFilesProcessor();
  RifLoader rifLoader = new RifLoader(loadOptions.getLoadOptions(), appState);

  /*
   * Create the DataSetMonitorListener that will glue those stages together
   * and run them all for each data set that is found.
   */
  DataSetMonitorListener dataSetMonitorListener =
      new DefaultDataSetMonitorListener(
          appState.getMetrics(), PipelineApplication::handleUncaughtException, rifProcessor, rifLoader);
  CcwRifLoadJob ccwRifLoadJob =
      new CcwRifLoadJob(
          appState.getMetrics(), loadOptions.getExtractionOptions(), s3TaskManager, dataSetMonitorListener);
  return ccwRifLoadJob;
}
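Something elsewhere in the application must invoke the returned job repeatedly; that scheduling is not shown on this page. A hypothetical driver loop, included only to illustrate the PipelineJobOutcome contract seen in call() above (it assumes PipelineJob#call() returns the same PipelineJobOutcome; the real application's scheduler may work quite differently):

final class PipelineDriver {
  /**
   * Hypothetical sketch: re-run the job immediately after real work, but
   * pause between polls when there was nothing to do. The poll interval is
   * an assumption.
   */
  static void drive(PipelineJob<?> job) throws Exception {
    while (!Thread.currentThread().isInterrupted()) {
      if (job.call() == PipelineJobOutcome.NOTHING_TO_DO) {
        Thread.sleep(30_000);
      }
    }
  }
}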
Use of gov.cms.bfd.pipeline.ccw.rif.extract.s3.task.S3TaskManager in project beneficiary-fhir-data by CMSgov.
The class ManifestEntryDownloadTaskIT, method testMD5ChkSum.
/**
 * Tests that the MD5 checksum of the downloaded S3 file matches the locally generated MD5 checksum.
 */
@SuppressWarnings("deprecation")
@Test
public void testMD5ChkSum() throws Exception {
  AmazonS3 s3Client = S3Utilities.createS3Client(new ExtractionOptions("foo"));
  Bucket bucket = null;
  try {
    bucket = DataSetTestUtilities.createTestBucket(s3Client);
    ExtractionOptions options = new ExtractionOptions(bucket.getName());
    LOGGER.info("Bucket created: '{}:{}'", s3Client.getS3AccountOwner().getDisplayName(), bucket.getName());
    DataSetManifest manifest = new DataSetManifest(
        Instant.now(), 0,
        new DataSetManifestEntry("beneficiaries.rif", RifFileType.BENEFICIARY));

    // Upload the beneficiary sample file to the S3 bucket created above.
    s3Client.putObject(DataSetTestUtilities.createPutRequest(bucket, manifest));
    s3Client.putObject(DataSetTestUtilities.createPutRequest(
        bucket, manifest, manifest.getEntries().get(0), StaticRifResource.SAMPLE_A_BENES.getResourceUrl()));

    // Download the file from S3 that was just uploaded above.
    GetObjectRequest objectRequest = new GetObjectRequest(
        bucket.getName(),
        String.format(
            "%s/%s/%s",
            CcwRifLoadJob.S3_PREFIX_PENDING_DATA_SETS,
            manifest.getEntries().get(0).getParentManifest().getTimestampText(),
            manifest.getEntries().get(0).getName()));
    Path localTempFile = Files.createTempFile("data-pipeline-s3-temp", ".rif");
    s3TaskManager = new S3TaskManager(
        PipelineTestUtils.get().getPipelineApplicationState().getMetrics(),
        new ExtractionOptions(options.getS3BucketName()));
    LOGGER.info("Downloading '{}' to '{}'...", objectRequest.getKey(), localTempFile.toAbsolutePath());
    Download downloadHandle =
        s3TaskManager.getS3TransferManager().download(objectRequest, localTempFile.toFile());
    downloadHandle.waitForCompletion();

    // Compare the checksum computed locally against the one stored in the
    // object's user metadata at upload time.
    InputStream downloadedInputStream = new FileInputStream(localTempFile.toString());
    String generatedMD5ChkSum = ManifestEntryDownloadTask.computeMD5ChkSum(downloadedInputStream);
    LOGGER.info("The generated MD5 value from Java (Base64 encoded) is: {}", generatedMD5ChkSum);
    String downloadedFileMD5ChkSum = downloadHandle.getObjectMetadata().getUserMetaDataOf("md5chksum");
    LOGGER.info("The MD5 value from the AWS S3 file's metadata is: {}", downloadedFileMD5ChkSum);
    assertEquals(
        downloadedFileMD5ChkSum,
        generatedMD5ChkSum,
        "Checksum doesn't match on downloaded file " + objectRequest.getKey());
    LOGGER.info("Downloaded '{}' to '{}'.", objectRequest.getKey(), localTempFile.toAbsolutePath());
  } catch (IOException e) {
    throw new UncheckedIOException(e);
  } catch (AmazonClientException e) {
    throw new AwsFailureException(e);
  } catch (InterruptedException e) {
    // Shouldn't happen, as our apps don't use thread interrupts.
    throw new BadCodeMonkeyException(e);
  } finally {
    if (bucket != null) DataSetTestUtilities.deleteObjectsAndBucket(s3Client, bucket);
  }
}
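For reference, the Base64-encoded MD5 value that ManifestEntryDownloadTask.computeMD5ChkSum(InputStream) produces can be computed with nothing but the JDK. A minimal sketch of an equivalent helper (the actual implementation may differ):

import java.io.IOException;
import java.io.InputStream;
import java.security.DigestInputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Base64;

final class Md5Sketch {
  /** Streams the input fully through an MD5 digest and Base64-encodes the result. */
  static String base64Md5(InputStream input) throws IOException, NoSuchAlgorithmException {
    MessageDigest md5 = MessageDigest.getInstance("MD5");
    try (DigestInputStream digestStream = new DigestInputStream(input, md5)) {
      byte[] buffer = new byte[8192];
      while (digestStream.read(buffer) != -1) {
        // Reading drives the digest; the bytes themselves are discarded.
      }
    }
    return Base64.getEncoder().encodeToString(md5.digest());
  }
}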