Use of gov.cms.bfd.pipeline.ccw.rif.extract.ExtractionOptions in project beneficiary-fhir-data by CMSgov.
In the class SyntheticDataUploader3, the method main:
/**
* Pushes the synthetic data from {@link SyntheticDataFixer3} up to S3, replacing any versions
* that are already there.
*
* @param args (not used)
* @throws Exception Any {@link Exception}s encountered will be bubbled up, halting the
* application.
*/
public static void main(String[] args) throws Exception {
ExtractionOptions options = new ExtractionOptions(String.format("bb-test-%d", new Random().nextInt(1000)));
AmazonS3 s3Client = S3Utilities.createS3Client(options);
LOGGER.info("Uploading fixed data...");
uploadSyntheticData(
    s3Client,
    TestDataSetLocation.SYNTHETIC_DATA.getS3KeyPrefix(),
    syntheticDataFile -> syntheticDataFile.getFixedFilePath());
LOGGER.info("Uploaded all data.");
}
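The uploadSyntheticData helper itself isn't included in this excerpt. As a rough sketch only (the SyntheticDataFile enumeration and the helper's exact signature are assumptions inferred from the call site, not the project's actual code), a helper of that shape might look like:

// Hypothetical sketch, not the project's actual implementation: iterate the
// synthetic files, let the caller's lambda pick the local Path for each, and
// upload it under the given key prefix. Uses java.util.function.Function and
// java.nio.file.Path; putObject(String, String, File) is a standard AmazonS3 overload.
private static void uploadSyntheticData(
    AmazonS3 s3Client, String s3KeyPrefix, Function<SyntheticDataFile, Path> fileSelector) {
  for (SyntheticDataFile syntheticDataFile : SyntheticDataFile.values()) {
    Path localPath = fileSelector.apply(syntheticDataFile);
    String objectKey = String.format("%s/%s", s3KeyPrefix, localPath.getFileName());
    s3Client.putObject(
        TestDataSetLocation.SYNTHETIC_DATA.getS3BucketName(), objectKey, localPath.toFile());
  }
}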
Use of gov.cms.bfd.pipeline.ccw.rif.extract.ExtractionOptions in project beneficiary-fhir-data by CMSgov.
In the class CcwRifLoadJobIT, the method multipleDataSetsTest:
/**
* Tests {@link CcwRifLoadJob} when run against a bucket containing multiple data sets.
*
* @throws Exception (exceptions indicate test failure)
*/
@Test
public void multipleDataSetsTest() throws Exception {
AmazonS3 s3Client = S3Utilities.createS3Client(new ExtractionOptions("foo"));
Bucket bucket = null;
try {
/*
 * Create the (empty) bucket to run against, and populate it with
 * three data set manifests: two sharing a timestamp (sequence IDs 0
 * and 1) and a third with a later timestamp.
 */
bucket = DataSetTestUtilities.createTestBucket(s3Client);
ExtractionOptions options =
    new ExtractionOptions(bucket.getName(), Optional.empty(), Optional.of(1));
LOGGER.info("Bucket created: '{}:{}'", s3Client.getS3AccountOwner().getDisplayName(), bucket.getName());
DataSetManifest manifestA = new DataSetManifest(
    Instant.now().minus(1L, ChronoUnit.HOURS),
    0,
    new DataSetManifestEntry("beneficiaries.rif", RifFileType.BENEFICIARY));
s3Client.putObject(DataSetTestUtilities.createPutRequest(bucket, manifestA));
s3Client.putObject(DataSetTestUtilities.createPutRequest(
    bucket, manifestA, manifestA.getEntries().get(0), StaticRifResource.SAMPLE_A_BENES.getResourceUrl()));
DataSetManifest manifestB = new DataSetManifest(
    manifestA.getTimestampText(), 1, new DataSetManifestEntry("pde.rif", RifFileType.PDE));
s3Client.putObject(DataSetTestUtilities.createPutRequest(bucket, manifestB));
s3Client.putObject(DataSetTestUtilities.createPutRequest(
    bucket, manifestB, manifestB.getEntries().get(0), StaticRifResource.SAMPLE_A_BENES.getResourceUrl()));
DataSetManifest manifestC = new DataSetManifest(
    Instant.now(), 0, new DataSetManifestEntry("carrier.rif", RifFileType.CARRIER));
s3Client.putObject(DataSetTestUtilities.createPutRequest(bucket, manifestC));
s3Client.putObject(DataSetTestUtilities.createPutRequest(
    bucket, manifestC, manifestC.getEntries().get(0), StaticRifResource.SAMPLE_A_CARRIER.getResourceUrl()));
// Run the job.
MockDataSetMonitorListener listener = new MockDataSetMonitorListener();
S3TaskManager s3TaskManager = new S3TaskManager(
    PipelineTestUtils.get().getPipelineApplicationState().getMetrics(), options);
CcwRifLoadJob ccwJob = new CcwRifLoadJob(
    PipelineTestUtils.get().getPipelineApplicationState().getMetrics(), options, s3TaskManager, listener);
ccwJob.call();
// Verify what was handed off to the DataSetMonitorListener.
assertEquals(0, listener.getNoDataAvailableEvents());
assertEquals(1, listener.getDataEvents().size());
assertEquals(manifestA.getTimestamp(), listener.getDataEvents().get(0).getTimestamp());
assertEquals(manifestA.getEntries().size(), listener.getDataEvents().get(0).getFileEvents().size());
assertEquals(0, listener.getErrorEvents().size());
/*
 * Verify that the first data set was moved to the completed prefix
 * and that the other two are still pending.
 */
DataSetTestUtilities.waitForBucketObjectCount(
    s3Client,
    bucket,
    CcwRifLoadJob.S3_PREFIX_PENDING_DATA_SETS,
    1 + manifestB.getEntries().size() + 1 + manifestC.getEntries().size(),
    java.time.Duration.ofSeconds(10));
DataSetTestUtilities.waitForBucketObjectCount(
    s3Client,
    bucket,
    CcwRifLoadJob.S3_PREFIX_COMPLETED_DATA_SETS,
    1 + manifestA.getEntries().size(),
    java.time.Duration.ofSeconds(10));
} finally {
if (bucket != null)
DataSetTestUtilities.deleteObjectsAndBucket(s3Client, bucket);
}
}
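DataSetTestUtilities.waitForBucketObjectCount isn't shown in this excerpt. A minimal sketch of that kind of polling assertion, assuming a single listObjects() page (up to 1000 keys) suffices for small test buckets:

// Hypothetical polling helper in the spirit of waitForBucketObjectCount: list the
// prefix repeatedly until the expected object count appears or the timeout elapses.
static void waitForObjectCount(
    AmazonS3 s3Client, String bucketName, String keyPrefix, int expectedCount,
    java.time.Duration timeout) throws InterruptedException {
  java.time.Instant deadline = java.time.Instant.now().plus(timeout);
  while (java.time.Instant.now().isBefore(deadline)) {
    int actualCount = s3Client.listObjects(bucketName, keyPrefix).getObjectSummaries().size();
    if (actualCount == expectedCount) return;
    Thread.sleep(100);
  }
  throw new IllegalStateException(
      "Timed out waiting for " + expectedCount + " objects under '" + keyPrefix + "'.");
}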
Use of gov.cms.bfd.pipeline.ccw.rif.extract.ExtractionOptions in project beneficiary-fhir-data by CMSgov.
In the class CcwRifLoadJobIT, the method singleDataSetTest:
/**
* Tests {@link CcwRifLoadJob} when run against a bucket with a single data set.
*
* @throws Exception (exceptions indicate test failure)
*/
@Test
public void singleDataSetTest() throws Exception {
AmazonS3 s3Client = S3Utilities.createS3Client(new ExtractionOptions("foo"));
Bucket bucket = null;
try {
/*
* Create the (empty) bucket to run against, and populate it with a
* data set.
*/
bucket = DataSetTestUtilities.createTestBucket(s3Client);
ExtractionOptions options = new ExtractionOptions(bucket.getName());
LOGGER.info("Bucket created: '{}:{}'", s3Client.getS3AccountOwner().getDisplayName(), bucket.getName());
DataSetManifest manifest = new DataSetManifest(
    Instant.now(),
    0,
    new DataSetManifestEntry("beneficiaries.rif", RifFileType.BENEFICIARY),
    new DataSetManifestEntry("carrier.rif", RifFileType.CARRIER));
s3Client.putObject(DataSetTestUtilities.createPutRequest(bucket, manifest));
s3Client.putObject(DataSetTestUtilities.createPutRequest(
    bucket, manifest, manifest.getEntries().get(0), StaticRifResource.SAMPLE_A_BENES.getResourceUrl()));
s3Client.putObject(DataSetTestUtilities.createPutRequest(
    bucket, manifest, manifest.getEntries().get(1), StaticRifResource.SAMPLE_A_CARRIER.getResourceUrl()));
// Run the job.
MockDataSetMonitorListener listener = new MockDataSetMonitorListener();
S3TaskManager s3TaskManager = new S3TaskManager(
    PipelineTestUtils.get().getPipelineApplicationState().getMetrics(), options);
CcwRifLoadJob ccwJob = new CcwRifLoadJob(
    PipelineTestUtils.get().getPipelineApplicationState().getMetrics(), options, s3TaskManager, listener);
ccwJob.call();
// Verify what was handed off to the DataSetMonitorListener.
assertEquals(0, listener.getNoDataAvailableEvents());
assertEquals(1, listener.getDataEvents().size());
assertEquals(manifest.getTimestamp(), listener.getDataEvents().get(0).getTimestamp());
assertEquals(manifest.getEntries().size(), listener.getDataEvents().get(0).getFileEvents().size());
assertEquals(0, listener.getErrorEvents().size());
// Verify that the data set was renamed.
DataSetTestUtilities.waitForBucketObjectCount(
    s3Client, bucket, CcwRifLoadJob.S3_PREFIX_PENDING_DATA_SETS, 0, java.time.Duration.ofSeconds(10));
DataSetTestUtilities.waitForBucketObjectCount(
    s3Client,
    bucket,
    CcwRifLoadJob.S3_PREFIX_COMPLETED_DATA_SETS,
    1 + manifest.getEntries().size(),
    java.time.Duration.ofSeconds(10));
} finally {
if (bucket != null)
DataSetTestUtilities.deleteObjectsAndBucket(s3Client, bucket);
}
}
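DataSetTestUtilities.createPutRequest is likewise not shown here. A hypothetical sketch of the per-entry overload, assuming pending objects are keyed as pending-prefix/timestamp/entry-name, which matches the GetObjectRequest built in testMD5ChkSum below:

// Hypothetical sketch: build a PutObjectRequest for one manifest entry, keyed the
// same way testMD5ChkSum below reads it back. The file: URL assumption only holds
// for resources that exist on the local filesystem.
static PutObjectRequest createPutRequest(
    Bucket bucket, DataSetManifest manifest, DataSetManifestEntry entry, URL resourceUrl) {
  String objectKey = String.format(
      "%s/%s/%s",
      CcwRifLoadJob.S3_PREFIX_PENDING_DATA_SETS,
      manifest.getTimestampText(),
      entry.getName());
  return new PutObjectRequest(bucket.getName(), objectKey, new File(resourceUrl.getFile()));
}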
Use of gov.cms.bfd.pipeline.ccw.rif.extract.ExtractionOptions in project beneficiary-fhir-data by CMSgov.
In the class ManifestEntryDownloadTaskIT, the method testMD5ChkSum:
/**
 * Test to ensure that the MD5 checksum of the downloaded S3 file matches the MD5 checksum
 * generated locally from its contents.
 *
 * @throws Exception (exceptions indicate test failure)
 */
@SuppressWarnings("deprecation")
@Test
public void testMD5ChkSum() throws Exception {
AmazonS3 s3Client = S3Utilities.createS3Client(new ExtractionOptions("foo"));
Bucket bucket = null;
try {
bucket = DataSetTestUtilities.createTestBucket(s3Client);
ExtractionOptions options = new ExtractionOptions(bucket.getName());
LOGGER.info("Bucket created: '{}:{}'", s3Client.getS3AccountOwner().getDisplayName(), bucket.getName());
DataSetManifest manifest = new DataSetManifest(Instant.now(), 0, new DataSetManifestEntry("beneficiaries.rif", RifFileType.BENEFICIARY));
// upload beneficiary sample file to S3 bucket created above
s3Client.putObject(DataSetTestUtilities.createPutRequest(bucket, manifest));
s3Client.putObject(DataSetTestUtilities.createPutRequest(
    bucket, manifest, manifest.getEntries().get(0), StaticRifResource.SAMPLE_A_BENES.getResourceUrl()));
// download file from S3 that was just uploaded above
GetObjectRequest objectRequest = new GetObjectRequest(
    bucket.getName(),
    String.format(
        "%s/%s/%s",
        CcwRifLoadJob.S3_PREFIX_PENDING_DATA_SETS,
        manifest.getEntries().get(0).getParentManifest().getTimestampText(),
        manifest.getEntries().get(0).getName()));
Path localTempFile = Files.createTempFile("data-pipeline-s3-temp", ".rif");
s3TaskManager = new S3TaskManager(
    PipelineTestUtils.get().getPipelineApplicationState().getMetrics(),
    new ExtractionOptions(options.getS3BucketName()));
LOGGER.info("Downloading '{}' to '{}'...", objectRequest.getKey(), localTempFile.toAbsolutePath().toString());
Download downloadHandle = s3TaskManager.getS3TransferManager().download(objectRequest, localTempFile.toFile());
downloadHandle.waitForCompletion();
InputStream downloadedInputStream = new FileInputStream(localTempFile.toString());
String generatedMD5ChkSum = ManifestEntryDownloadTask.computeMD5ChkSum(downloadedInputStream);
LOGGER.info("The generated MD5 value from Java (Base64 encoded) is:" + generatedMD5ChkSum);
String downloadedFileMD5ChkSum = downloadHandle.getObjectMetadata().getUserMetaDataOf("md5chksum");
LOGGER.info("The MD5 value from AWS S3 file's metadata is: " + downloadedFileMD5ChkSum);
assertEquals(downloadedFileMD5ChkSum, generatedMD5ChkSum, "Checksum doesn't match on downloaded file " + objectRequest.getKey());
LOGGER.info("Downloaded '{}' to '{}'.", objectRequest.getKey(), localTempFile.toAbsolutePath().toString());
} catch (IOException e) {
throw new UncheckedIOException(e);
} catch (AmazonClientException e) {
throw new AwsFailureException(e);
} catch (InterruptedException e) {
// Shouldn't happen, as our apps don't use thread interrupts.
throw new BadCodeMonkeyException(e);
} finally {
if (bucket != null)
DataSetTestUtilities.deleteObjectsAndBucket(s3Client, bucket);
}
}
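Per the log message above, computeMD5ChkSum produces a Base64-encoded MD5 digest. A minimal standard-library sketch of that computation (the project's actual method may differ in buffering and error handling):

// Minimal sketch of a Base64-encoded MD5 checksum over a stream, matching the
// "Base64 encoded" log message above; not necessarily the project's exact code.
// Uses java.security.MessageDigest and java.util.Base64.
static String computeMD5ChkSum(InputStream stream) throws IOException, NoSuchAlgorithmException {
  MessageDigest md5 = MessageDigest.getInstance("MD5");
  byte[] buffer = new byte[8192];
  for (int bytesRead = stream.read(buffer); bytesRead != -1; bytesRead = stream.read(buffer)) {
    md5.update(buffer, 0, bytesRead);
  }
  return Base64.getEncoder().encodeToString(md5.digest());
}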
Use of gov.cms.bfd.pipeline.ccw.rif.extract.ExtractionOptions in project beneficiary-fhir-data by CMSgov.
In the class DataSetSubsetter, the method main:
/**
* The application entry point that can be used to run the {@link DataSetSubsetter}.
*
* @param args (not used)
* @throws Exception Any exceptions thrown will be bubbled up, terminating the app.
*/
public static void main(String[] args) throws Exception {
/*
* From the original source data set of 1M beneficiaries and their
* claims, create subsets going all the way down by powers of ten. This
* gives test authors lots of good options for how much data to test
* against. Note that on Karl's `jordan-u` system, this took 5.5h to
* run.
*/
for (int beneCount = 1000000; beneCount >= 10; beneCount /= 10) {
// Grab the source and target constants.
final int sourceBeneCount = beneCount;
final int targetBeneCount = beneCount / 10;
TestDataSetLocation sourceDataSet = Arrays.stream(TestDataSetLocation.class.getEnumConstants())
    .filter(c -> c.name().matches("DUMMY_DATA_" + sourceBeneCount + "_BENES"))
    .findAny()
    .get();
TestDataSetLocation targetDataSet = Arrays.stream(TestDataSetLocation.class.getEnumConstants())
    .filter(c -> c.name().matches("DUMMY_DATA_" + targetBeneCount + "_BENES"))
    .findAny()
    .get();
// Figure out which local directories to store the source data set in.
Path outputDirectory = Paths.get(".", "test-data-random");
Files.createDirectories(outputDirectory);
String sourceDataSetId =
    Arrays.stream(sourceDataSet.getS3KeyPrefix().split("/")).reduce((a, b) -> b).get();
Path sourceDataSetDirectory = outputDirectory.resolve(sourceDataSetId);
// Download the source data set and build the target from it.
ExtractionOptions options = new ExtractionOptions(sourceDataSet.getS3BucketName());
String targetDataSetId =
    Arrays.stream(targetDataSet.getS3KeyPrefix().split("/")).reduce((a, b) -> b).get();
Path targetDataSetDirectory = outputDirectory.resolve(targetDataSetId);
Instant targetDataSetTimestamp =
    Instant.parse(targetDataSetId.replaceFirst("\\d+-beneficiaries-", ""));
try (IDataSetWriter output = new LocalDataSetWriter(targetDataSetDirectory, targetDataSetTimestamp)) {
Files.createDirectories(sourceDataSetDirectory);
List<RifFile> rifFiles = downloadDataSet(options, sourceDataSetId, sourceDataSetDirectory);
DataSetSubsetter.createSubset(output, targetBeneCount, rifFiles);
}
}
}
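The Arrays.stream(...).reduce((a, b) -> b) calls above keep only the final segment of the S3 key prefix. A small standalone illustration, using a made-up prefix value consistent with the replaceFirst pattern above:

// reduce((a, b) -> b) discards all but the last element of the stream, so this
// extracts the final path segment; the example prefix is hypothetical.
String s3KeyPrefix = "test-data/100-beneficiaries-2017-11-27T00:00:00Z";
String dataSetId = Arrays.stream(s3KeyPrefix.split("/")).reduce((a, b) -> b).get();
// dataSetId == "100-beneficiaries-2017-11-27T00:00:00Z"
Instant timestamp = Instant.parse(dataSetId.replaceFirst("\\d+-beneficiaries-", ""));
// timestamp == 2017-11-27T00:00:00Z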