Search in sources:

Example 1 with ExtractionOptions

use of gov.cms.bfd.pipeline.ccw.rif.extract.ExtractionOptions in project beneficiary-fhir-data by CMSgov.

The class SyntheticDataUploader3, method main.

/**
 * Pushes the synthetic data from {@link SyntheticDataFixer3} up to S3, replacing any versions
 * that are already there.
 *
 * @param args (not used)
 * @throws Exception Any {@link Exception}s encountered will be bubbled up, halting the
 *     application.
 */
public static void main(String[] args) throws Exception {
    // Build options pointing at a randomly-numbered "bb-test-NNN" bucket name.
    String bucketName = String.format("bb-test-%d", new Random().nextInt(1000));
    ExtractionOptions options = new ExtractionOptions(bucketName);
    AmazonS3 s3Client = S3Utilities.createS3Client(options);
    LOGGER.info("Uploading fixed data...");
    // Upload each synthetic data file's fixed version under the synthetic data key prefix.
    uploadSyntheticData(
        s3Client,
        TestDataSetLocation.SYNTHETIC_DATA.getS3KeyPrefix(),
        dataFile -> dataFile.getFixedFilePath());
    LOGGER.info("Uploaded all data.");
}
Also used : AmazonS3(com.amazonaws.services.s3.AmazonS3) Random(java.util.Random) ExtractionOptions(gov.cms.bfd.pipeline.ccw.rif.extract.ExtractionOptions)

Example 2 with ExtractionOptions

use of gov.cms.bfd.pipeline.ccw.rif.extract.ExtractionOptions in project beneficiary-fhir-data by CMSgov.

The class CcwRifLoadJobIT, method multipleDataSetsTest.

/**
 * Tests {@link CcwRifLoadJob} when run against a bucket containing multiple data sets. Per the
 * assertions below, a single job run hands only the oldest data set (manifestA's) to the
 * listener and moves its objects to the completed prefix, while the objects for manifestB and
 * manifestC remain pending.
 *
 * @throws Exception (exceptions indicate test failure)
 */
@Test
public void multipleDataSetsTest() throws Exception {
    AmazonS3 s3Client = S3Utilities.createS3Client(new ExtractionOptions("foo"));
    Bucket bucket = null;
    try {
        /*
       * Create the (empty) bucket to run against, and populate it with
       * two data sets.
       */
        bucket = DataSetTestUtilities.createTestBucket(s3Client);
        // NOTE(review): the two Optional args look like filter/limit knobs on
        // ExtractionOptions — confirm their semantics against that class.
        ExtractionOptions options = new ExtractionOptions(bucket.getName(), Optional.empty(), Optional.of(1));
        LOGGER.info("Bucket created: '{}:{}'", s3Client.getS3AccountOwner().getDisplayName(), bucket.getName());
        // manifestA: oldest timestamp (an hour ago), sequence number 0.
        DataSetManifest manifestA = new DataSetManifest(Instant.now().minus(1L, ChronoUnit.HOURS), 0, new DataSetManifestEntry("beneficiaries.rif", RifFileType.BENEFICIARY));
        s3Client.putObject(DataSetTestUtilities.createPutRequest(bucket, manifestA));
        s3Client.putObject(DataSetTestUtilities.createPutRequest(bucket, manifestA, manifestA.getEntries().get(0), StaticRifResource.SAMPLE_A_BENES.getResourceUrl()));
        // manifestB: same timestamp text as manifestA, but sequence number 1.
        DataSetManifest manifestB = new DataSetManifest(manifestA.getTimestampText(), 1, new DataSetManifestEntry("pde.rif", RifFileType.PDE));
        s3Client.putObject(DataSetTestUtilities.createPutRequest(bucket, manifestB));
        s3Client.putObject(DataSetTestUtilities.createPutRequest(bucket, manifestB, manifestB.getEntries().get(0), StaticRifResource.SAMPLE_A_BENES.getResourceUrl()));
        // manifestC: current timestamp, sequence number 0.
        DataSetManifest manifestC = new DataSetManifest(Instant.now(), 0, new DataSetManifestEntry("carrier.rif", RifFileType.CARRIER));
        s3Client.putObject(DataSetTestUtilities.createPutRequest(bucket, manifestC));
        s3Client.putObject(DataSetTestUtilities.createPutRequest(bucket, manifestC, manifestC.getEntries().get(0), StaticRifResource.SAMPLE_A_CARRIER.getResourceUrl()));
        // Run the job.
        MockDataSetMonitorListener listener = new MockDataSetMonitorListener();
        S3TaskManager s3TaskManager = new S3TaskManager(PipelineTestUtils.get().getPipelineApplicationState().getMetrics(), options);
        CcwRifLoadJob ccwJob = new CcwRifLoadJob(PipelineTestUtils.get().getPipelineApplicationState().getMetrics(), options, s3TaskManager, listener);
        ccwJob.call();
        // Verify what was handed off to the DataSetMonitorListener: exactly one data set
        // (manifestA's, matched by timestamp and entry count) and no errors.
        assertEquals(0, listener.getNoDataAvailableEvents());
        assertEquals(1, listener.getDataEvents().size());
        assertEquals(manifestA.getTimestamp(), listener.getDataEvents().get(0).getTimestamp());
        assertEquals(manifestA.getEntries().size(), listener.getDataEvents().get(0).getFileEvents().size());
        assertEquals(0, listener.getErrorEvents().size());
        /*
       * Verify that the first data set was renamed and the second is
       * still there.
       */
        // Pending prefix: manifestB + its entry and manifestC + its entry remain.
        DataSetTestUtilities.waitForBucketObjectCount(s3Client, bucket, CcwRifLoadJob.S3_PREFIX_PENDING_DATA_SETS, 1 + manifestB.getEntries().size() + 1 + manifestC.getEntries().size(), java.time.Duration.ofSeconds(10));
        // Completed prefix: manifestA + its entry were moved here.
        DataSetTestUtilities.waitForBucketObjectCount(s3Client, bucket, CcwRifLoadJob.S3_PREFIX_COMPLETED_DATA_SETS, 1 + manifestA.getEntries().size(), java.time.Duration.ofSeconds(10));
    } finally {
        // Best-effort cleanup of everything this test created in S3.
        if (bucket != null)
            DataSetTestUtilities.deleteObjectsAndBucket(s3Client, bucket);
    }
}
Also used : AmazonS3(com.amazonaws.services.s3.AmazonS3) DataSetManifest(gov.cms.bfd.pipeline.ccw.rif.extract.s3.DataSetManifest) S3TaskManager(gov.cms.bfd.pipeline.ccw.rif.extract.s3.task.S3TaskManager) Bucket(com.amazonaws.services.s3.model.Bucket) ExtractionOptions(gov.cms.bfd.pipeline.ccw.rif.extract.ExtractionOptions) MockDataSetMonitorListener(gov.cms.bfd.pipeline.ccw.rif.extract.s3.MockDataSetMonitorListener) DataSetManifestEntry(gov.cms.bfd.pipeline.ccw.rif.extract.s3.DataSetManifest.DataSetManifestEntry) Test(org.junit.jupiter.api.Test)

Example 3 with ExtractionOptions

use of gov.cms.bfd.pipeline.ccw.rif.extract.ExtractionOptions in project beneficiary-fhir-data by CMSgov.

The class CcwRifLoadJobIT, method singleDataSetTest.

/**
 * Tests {@link CcwRifLoadJob} when run against a bucket with a single data set. Per the
 * assertions below, the job should hand that data set (both its entries) to the listener and
 * move all of its objects from the pending prefix to the completed prefix.
 *
 * @throws Exception (exceptions indicate test failure)
 */
@Test
public void singleDataSetTest() throws Exception {
    AmazonS3 s3Client = S3Utilities.createS3Client(new ExtractionOptions("foo"));
    Bucket bucket = null;
    try {
        /*
       * Create the (empty) bucket to run against, and populate it with a
       * data set.
       */
        bucket = DataSetTestUtilities.createTestBucket(s3Client);
        ExtractionOptions options = new ExtractionOptions(bucket.getName());
        LOGGER.info("Bucket created: '{}:{}'", s3Client.getS3AccountOwner().getDisplayName(), bucket.getName());
        // One manifest with two entries: a beneficiary file and a carrier claims file.
        DataSetManifest manifest = new DataSetManifest(Instant.now(), 0, new DataSetManifestEntry("beneficiaries.rif", RifFileType.BENEFICIARY), new DataSetManifestEntry("carrier.rif", RifFileType.CARRIER));
        s3Client.putObject(DataSetTestUtilities.createPutRequest(bucket, manifest));
        s3Client.putObject(DataSetTestUtilities.createPutRequest(bucket, manifest, manifest.getEntries().get(0), StaticRifResource.SAMPLE_A_BENES.getResourceUrl()));
        s3Client.putObject(DataSetTestUtilities.createPutRequest(bucket, manifest, manifest.getEntries().get(1), StaticRifResource.SAMPLE_A_CARRIER.getResourceUrl()));
        // Run the job.
        MockDataSetMonitorListener listener = new MockDataSetMonitorListener();
        S3TaskManager s3TaskManager = new S3TaskManager(PipelineTestUtils.get().getPipelineApplicationState().getMetrics(), options);
        CcwRifLoadJob ccwJob = new CcwRifLoadJob(PipelineTestUtils.get().getPipelineApplicationState().getMetrics(), options, s3TaskManager, listener);
        ccwJob.call();
        // Verify what was handed off to the DataSetMonitorListener: exactly one data set,
        // matched by timestamp and entry count, with no errors.
        assertEquals(0, listener.getNoDataAvailableEvents());
        assertEquals(1, listener.getDataEvents().size());
        assertEquals(manifest.getTimestamp(), listener.getDataEvents().get(0).getTimestamp());
        assertEquals(manifest.getEntries().size(), listener.getDataEvents().get(0).getFileEvents().size());
        assertEquals(0, listener.getErrorEvents().size());
        // Verify that the data set was renamed: nothing remains pending, and the manifest
        // plus its two entries now live under the completed prefix.
        DataSetTestUtilities.waitForBucketObjectCount(s3Client, bucket, CcwRifLoadJob.S3_PREFIX_PENDING_DATA_SETS, 0, java.time.Duration.ofSeconds(10));
        DataSetTestUtilities.waitForBucketObjectCount(s3Client, bucket, CcwRifLoadJob.S3_PREFIX_COMPLETED_DATA_SETS, 1 + manifest.getEntries().size(), java.time.Duration.ofSeconds(10));
    } finally {
        // Best-effort cleanup of everything this test created in S3.
        if (bucket != null)
            DataSetTestUtilities.deleteObjectsAndBucket(s3Client, bucket);
    }
}
Also used : AmazonS3(com.amazonaws.services.s3.AmazonS3) DataSetManifest(gov.cms.bfd.pipeline.ccw.rif.extract.s3.DataSetManifest) S3TaskManager(gov.cms.bfd.pipeline.ccw.rif.extract.s3.task.S3TaskManager) Bucket(com.amazonaws.services.s3.model.Bucket) ExtractionOptions(gov.cms.bfd.pipeline.ccw.rif.extract.ExtractionOptions) MockDataSetMonitorListener(gov.cms.bfd.pipeline.ccw.rif.extract.s3.MockDataSetMonitorListener) DataSetManifestEntry(gov.cms.bfd.pipeline.ccw.rif.extract.s3.DataSetManifest.DataSetManifestEntry) Test(org.junit.jupiter.api.Test)

Example 4 with ExtractionOptions

use of gov.cms.bfd.pipeline.ccw.rif.extract.ExtractionOptions in project beneficiary-fhir-data by CMSgov.

The class ManifestEntryDownloadTaskIT, method testMD5ChkSum.

/**
 * Test to ensure the MD5ChkSum of the downloaded S3 file matches the generated MD5ChkSum value
 * stored in the S3 object's {@code md5chksum} user metadata.
 *
 * @throws Exception (exceptions indicate test failure)
 */
@SuppressWarnings("deprecation")
@Test
public void testMD5ChkSum() throws Exception {
    AmazonS3 s3Client = S3Utilities.createS3Client(new ExtractionOptions("foo"));
    Bucket bucket = null;
    try {
        bucket = DataSetTestUtilities.createTestBucket(s3Client);
        ExtractionOptions options = new ExtractionOptions(bucket.getName());
        LOGGER.info("Bucket created: '{}:{}'", s3Client.getS3AccountOwner().getDisplayName(), bucket.getName());
        DataSetManifest manifest = new DataSetManifest(Instant.now(), 0, new DataSetManifestEntry("beneficiaries.rif", RifFileType.BENEFICIARY));
        // upload beneficiary sample file to S3 bucket created above
        s3Client.putObject(DataSetTestUtilities.createPutRequest(bucket, manifest));
        s3Client.putObject(DataSetTestUtilities.createPutRequest(bucket, manifest, manifest.getEntries().get(0), StaticRifResource.SAMPLE_A_BENES.getResourceUrl()));
        // download file from S3 that was just uploaded above
        GetObjectRequest objectRequest = new GetObjectRequest(bucket.getName(), String.format("%s/%s/%s", CcwRifLoadJob.S3_PREFIX_PENDING_DATA_SETS, manifest.getEntries().get(0).getParentManifest().getTimestampText(), manifest.getEntries().get(0).getName()));
        Path localTempFile = Files.createTempFile("data-pipeline-s3-temp", ".rif");
        s3TaskManager = new S3TaskManager(PipelineTestUtils.get().getPipelineApplicationState().getMetrics(), new ExtractionOptions(options.getS3BucketName()));
        LOGGER.info("Downloading '{}' to '{}'...", objectRequest.getKey(), localTempFile.toAbsolutePath().toString());
        Download downloadHandle = s3TaskManager.getS3TransferManager().download(objectRequest, localTempFile.toFile());
        downloadHandle.waitForCompletion();
        // Compute the MD5 checksum of the downloaded copy. try-with-resources ensures the
        // stream is closed even if the checksum computation fails (previously it leaked).
        String generatedMD5ChkSum;
        try (InputStream downloadedInputStream = new FileInputStream(localTempFile.toString())) {
            generatedMD5ChkSum = ManifestEntryDownloadTask.computeMD5ChkSum(downloadedInputStream);
        }
        LOGGER.info("The generated MD5 value from Java (Base64 encoded) is:" + generatedMD5ChkSum);
        String downloadedFileMD5ChkSum = downloadHandle.getObjectMetadata().getUserMetaDataOf("md5chksum");
        LOGGER.info("The MD5 value from AWS S3 file's metadata is: " + downloadedFileMD5ChkSum);
        assertEquals(downloadedFileMD5ChkSum, generatedMD5ChkSum, "Checksum doesn't match on downloaded file " + objectRequest.getKey());
        LOGGER.info("Downloaded '{}' to '{}'.", objectRequest.getKey(), localTempFile.toAbsolutePath().toString());
    } catch (IOException e) {
        throw new UncheckedIOException(e);
    } catch (AmazonClientException e) {
        throw new AwsFailureException(e);
    } catch (InterruptedException e) {
        // Shouldn't happen, as our apps don't use thread interrupts, but restore the
        // thread's interrupt status before wrapping so it isn't silently discarded.
        Thread.currentThread().interrupt();
        throw new BadCodeMonkeyException(e);
    } finally {
        // Best-effort cleanup of everything this test created in S3.
        if (bucket != null)
            DataSetTestUtilities.deleteObjectsAndBucket(s3Client, bucket);
    }
}
Also used : Path(java.nio.file.Path) AmazonS3(com.amazonaws.services.s3.AmazonS3) DataSetManifest(gov.cms.bfd.pipeline.ccw.rif.extract.s3.DataSetManifest) BadCodeMonkeyException(gov.cms.bfd.sharedutils.exceptions.BadCodeMonkeyException) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) AmazonClientException(com.amazonaws.AmazonClientException) ExtractionOptions(gov.cms.bfd.pipeline.ccw.rif.extract.ExtractionOptions) UncheckedIOException(java.io.UncheckedIOException) IOException(java.io.IOException) UncheckedIOException(java.io.UncheckedIOException) AwsFailureException(gov.cms.bfd.pipeline.ccw.rif.extract.exceptions.AwsFailureException) FileInputStream(java.io.FileInputStream) Bucket(com.amazonaws.services.s3.model.Bucket) GetObjectRequest(com.amazonaws.services.s3.model.GetObjectRequest) Download(com.amazonaws.services.s3.transfer.Download) DataSetManifestEntry(gov.cms.bfd.pipeline.ccw.rif.extract.s3.DataSetManifest.DataSetManifestEntry) Test(org.junit.jupiter.api.Test)

Example 5 with ExtractionOptions

use of gov.cms.bfd.pipeline.ccw.rif.extract.ExtractionOptions in project beneficiary-fhir-data by CMSgov.

The class DataSetSubsetter, method main.

/**
 * The application entry point that can be used to run the {@link DataSetSubsetter}.
 *
 * @param args (not used)
 * @throws Exception Any exceptions thrown will be bubbled up, terminating the app.
 */
public static void main(String[] args) throws Exception {
    /*
     * From the original source data set of 1M beneficiaries and their
     * claims, create subsets going all the way down by powers of ten. This
     * gives test authors lots of good options for how much data to test
     * against. Note that on Karl's `jordan-u` system, this took 5.5h to
     * run.
     */
    for (int beneCount = 1000000; beneCount >= 10; beneCount /= 10) {
        // Grab the source and target data set locations for this iteration.
        final int sourceBeneCount = beneCount;
        final int targetBeneCount = beneCount / 10;
        TestDataSetLocation sourceDataSet = findDummyDataSet(sourceBeneCount);
        TestDataSetLocation targetDataSet = findDummyDataSet(targetBeneCount);
        // Figure out what directories to store the source in locally.
        Path outputDirectory = Paths.get(".", "test-data-random");
        Files.createDirectories(outputDirectory);
        String sourceDataSetId = lastSegmentOf(sourceDataSet.getS3KeyPrefix());
        Path sourceDataSetDirectory = outputDirectory.resolve(sourceDataSetId);
        // Download the source data set and build the target from it.
        ExtractionOptions options = new ExtractionOptions(sourceDataSet.getS3BucketName());
        String targetDataSetId = lastSegmentOf(targetDataSet.getS3KeyPrefix());
        Path targetDataSetDirectory = outputDirectory.resolve(targetDataSetId);
        // The target data set's timestamp is parsed from its ID, after stripping the
        // "<count>-beneficiaries-" prefix.
        Instant targetDataSetTimestamp = Instant.parse(targetDataSetId.replaceFirst("\\d+-beneficiaries-", ""));
        try (IDataSetWriter output = new LocalDataSetWriter(targetDataSetDirectory, targetDataSetTimestamp)) {
            Files.createDirectories(sourceDataSetDirectory);
            List<RifFile> rifFiles = downloadDataSet(options, sourceDataSetId, sourceDataSetDirectory);
            DataSetSubsetter.createSubset(output, targetBeneCount, rifFiles);
        }
    }
}

/**
 * Looks up the {@link TestDataSetLocation} constant named {@code DUMMY_DATA_<beneCount>_BENES}.
 *
 * @param beneCount the beneficiary count embedded in the constant's name
 * @return the matching {@link TestDataSetLocation}
 */
private static TestDataSetLocation findDummyDataSet(int beneCount) {
    return Arrays.stream(TestDataSetLocation.class.getEnumConstants())
        .filter(c -> c.name().matches("DUMMY_DATA_" + beneCount + "_BENES"))
        .findAny()
        .get();
}

/**
 * Returns the last {@code /}-separated segment of the specified S3 key prefix.
 *
 * @param s3KeyPrefix the S3 key prefix to take the last segment of
 * @return the text after the final {@code /} (or the whole input, if it has none)
 */
private static String lastSegmentOf(String s3KeyPrefix) {
    return Arrays.stream(s3KeyPrefix.split("/")).reduce((first, second) -> second).get();
}
Also used : Path(java.nio.file.Path) Arrays(java.util.Arrays) CarrierClaimColumn(gov.cms.bfd.model.rif.CarrierClaimColumn) RifFileType(gov.cms.bfd.model.rif.RifFileType) S3RifFile(gov.cms.bfd.pipeline.ccw.rif.extract.s3.S3RifFile) S3Utilities(gov.cms.bfd.pipeline.ccw.rif.extract.s3.S3Utilities) LoggerFactory(org.slf4j.LoggerFactory) SNFClaimColumn(gov.cms.bfd.model.rif.SNFClaimColumn) HHAClaimColumn(gov.cms.bfd.model.rif.HHAClaimColumn) CSVFormat(org.apache.commons.csv.CSVFormat) Map(java.util.Map) CSVParser(org.apache.commons.csv.CSVParser) Path(java.nio.file.Path) TransferManagerBuilder(com.amazonaws.services.s3.transfer.TransferManagerBuilder) InpatientClaimColumn(gov.cms.bfd.model.rif.InpatientClaimColumn) Set(java.util.Set) RifFile(gov.cms.bfd.model.rif.RifFile) Instant(java.time.Instant) Collectors(java.util.stream.Collectors) JAXBException(javax.xml.bind.JAXBException) UncheckedIOException(java.io.UncheckedIOException) List(java.util.List) UncheckedJaxbException(gov.cms.bfd.sharedutils.exceptions.UncheckedJaxbException) Entry(java.util.Map.Entry) RifParsingUtils(gov.cms.bfd.model.rif.parse.RifParsingUtils) AmazonClientException(com.amazonaws.AmazonClientException) BeneficiaryColumn(gov.cms.bfd.model.rif.BeneficiaryColumn) CSVPrinter(org.apache.commons.csv.CSVPrinter) TransferManager(com.amazonaws.services.s3.transfer.TransferManager) LocalRifFile(gov.cms.bfd.pipeline.ccw.rif.extract.LocalRifFile) OutpatientClaimColumn(gov.cms.bfd.model.rif.OutpatientClaimColumn) HospiceClaimColumn(gov.cms.bfd.model.rif.HospiceClaimColumn) Marshaller(javax.xml.bind.Marshaller) HashMap(java.util.HashMap) ExtractionOptions(gov.cms.bfd.pipeline.ccw.rif.extract.ExtractionOptions) Download(com.amazonaws.services.s3.transfer.Download) ArrayList(java.util.ArrayList) DataSetManifestEntry(gov.cms.bfd.pipeline.ccw.rif.extract.s3.DataSetManifest.DataSetManifestEntry) HashSet(java.util.HashSet) TestDataSetLocation(gov.cms.bfd.model.rif.samples.TestDataSetLocation) 
AmazonS3(com.amazonaws.services.s3.AmazonS3) PartDEventColumn(gov.cms.bfd.model.rif.PartDEventColumn) JAXBContext(javax.xml.bind.JAXBContext) Unmarshaller(javax.xml.bind.Unmarshaller) Logger(org.slf4j.Logger) Files(java.nio.file.Files) FileWriter(java.io.FileWriter) IOException(java.io.IOException) DataSetManifest(gov.cms.bfd.pipeline.ccw.rif.extract.s3.DataSetManifest) Paths(java.nio.file.Paths) DMEClaimColumn(gov.cms.bfd.model.rif.DMEClaimColumn) Collections(java.util.Collections) S3RifFile(gov.cms.bfd.pipeline.ccw.rif.extract.s3.S3RifFile) RifFile(gov.cms.bfd.model.rif.RifFile) LocalRifFile(gov.cms.bfd.pipeline.ccw.rif.extract.LocalRifFile) TestDataSetLocation(gov.cms.bfd.model.rif.samples.TestDataSetLocation) Instant(java.time.Instant) ExtractionOptions(gov.cms.bfd.pipeline.ccw.rif.extract.ExtractionOptions)

Aggregations

ExtractionOptions (gov.cms.bfd.pipeline.ccw.rif.extract.ExtractionOptions)10 AmazonS3 (com.amazonaws.services.s3.AmazonS3)9 Bucket (com.amazonaws.services.s3.model.Bucket)5 DataSetManifest (gov.cms.bfd.pipeline.ccw.rif.extract.s3.DataSetManifest)5 DataSetManifestEntry (gov.cms.bfd.pipeline.ccw.rif.extract.s3.DataSetManifest.DataSetManifestEntry)5 Test (org.junit.jupiter.api.Test)5 MockDataSetMonitorListener (gov.cms.bfd.pipeline.ccw.rif.extract.s3.MockDataSetMonitorListener)4 S3TaskManager (gov.cms.bfd.pipeline.ccw.rif.extract.s3.task.S3TaskManager)4 AmazonClientException (com.amazonaws.AmazonClientException)3 Random (java.util.Random)3 Download (com.amazonaws.services.s3.transfer.Download)2 RifFileType (gov.cms.bfd.model.rif.RifFileType)2 IOException (java.io.IOException)2 UncheckedIOException (java.io.UncheckedIOException)2 Path (java.nio.file.Path)2 DefaultAWSCredentialsProviderChain (com.amazonaws.auth.DefaultAWSCredentialsProviderChain)1 GetObjectRequest (com.amazonaws.services.s3.model.GetObjectRequest)1 TransferManager (com.amazonaws.services.s3.transfer.TransferManager)1 TransferManagerBuilder (com.amazonaws.services.s3.transfer.TransferManagerBuilder)1 BeneficiaryColumn (gov.cms.bfd.model.rif.BeneficiaryColumn)1