use of gov.cms.bfd.pipeline.ccw.rif.extract.s3.DataSetManifest.DataSetManifestId in project beneficiary-fhir-data by CMSgov.
the class DataSetManifestTest method manifestIdRoundtrip.
/**
* Verifies that {@link DataSetManifestId}s can be round-tripped, as expected. A regression test
* case for <a href="http://issues.hhsdevcloud.us/browse/CBBD-298">CBBD-298: Error reading some
* data set manifests in S3: "AmazonS3Exception: The specified key does not exist"</a>.
*/
@Test
public void manifestIdRoundtrip() {
  String s3Key =
      CcwRifLoadJob.S3_PREFIX_PENDING_DATA_SETS + "/2017-07-11T00:00:00.000Z/1_manifest.xml";
  DataSetManifestId manifestId = DataSetManifestId.parseManifestIdFromS3Key(s3Key);
  assertEquals(s3Key, manifestId.computeS3Key(CcwRifLoadJob.S3_PREFIX_PENDING_DATA_SETS));
}
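The round trip above only works if computeS3Key reproduces the original key character-for-character. As a rough, self-contained sketch of the idea (the key layout matches the test above, but the class below is an illustrative stand-in, not the project's DataSetManifestId):

import java.time.Instant;
import java.time.format.DateTimeParseException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/** Illustrative stand-in for DataSetManifestId; not the project's actual implementation. */
final class ManifestIdSketch {
  /** Assumed key layout: "<prefix>/<ISO-8601 instant>/<sequence>_manifest.xml". */
  private static final Pattern KEY_PATTERN = Pattern.compile(".*/([^/]+)/(\\d+)_manifest\\.xml");

  /** The original timestamp text is kept verbatim so the key round-trips exactly. */
  private final String timestampText;

  private final int sequenceId;

  private ManifestIdSketch(String timestampText, int sequenceId) {
    // Parse purely to validate that the segment really is an ISO-8601 instant.
    Instant.parse(timestampText);
    this.timestampText = timestampText;
    this.sequenceId = sequenceId;
  }

  /** @return the parsed id, or null if the key's timestamp segment isn't a valid instant */
  static ManifestIdSketch parseManifestIdFromS3Key(String s3Key) {
    Matcher matcher = KEY_PATTERN.matcher(s3Key);
    if (!matcher.matches()) return null;
    try {
      return new ManifestIdSketch(matcher.group(1), Integer.parseInt(matcher.group(2)));
    } catch (DateTimeParseException e) {
      return null;
    }
  }

  /** @return the S3 key for this manifest under the given key prefix */
  String computeS3Key(String prefix) {
    return prefix + "/" + timestampText + "/" + sequenceId + "_manifest.xml";
  }
}

Note the deliberate choice to re-emit the original timestamp text rather than the parsed value: Instant.parse("2017-07-11T00:00:00.000Z").toString() yields "2017-07-11T00:00:00Z" (zero fractional seconds are dropped), and that kind of silent key rewrite is exactly the sort of mismatch that could produce the "specified key does not exist" failure CBBD-298 describes.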
use of gov.cms.bfd.pipeline.ccw.rif.extract.s3.DataSetManifest.DataSetManifestId in project beneficiary-fhir-data by CMSgov.
the class DataSetQueue method listPendingManifests.
/**
 * @return the {@link DataSetManifestId}s for every manifest currently found in S3 under the
 *     {@value CcwRifLoadJob#S3_PREFIX_PENDING_DATA_SETS} key prefix
 */
private Set<DataSetManifestId> listPendingManifests() {
  Timer.Context timerS3Scanning =
      appMetrics.timer(MetricRegistry.name(getClass().getSimpleName(), "s3Scanning")).time();
  LOGGER.debug("Scanning for data sets in S3...");
  Set<DataSetManifestId> manifestIds = new HashSet<>();

  /*
   * Request a list of all objects in the configured bucket and directory.
   * (In the results, we'll be looking for the oldest manifest file, if any.)
   */
  ListObjectsV2Request s3BucketListRequest = new ListObjectsV2Request();
  s3BucketListRequest.setBucketName(options.getS3BucketName());
  if (options.getS3ListMaxKeys().isPresent())
    s3BucketListRequest.setMaxKeys(options.getS3ListMaxKeys().get());

  /*
   * S3 will return results in separate pages. Loop through all of the
   * pages, looking for manifests.
   */
  int completedManifestsCount = 0;
  ListObjectsV2Result s3ObjectListing;
  do {
    s3ObjectListing = s3TaskManager.getS3Client().listObjectsV2(s3BucketListRequest);

    for (S3ObjectSummary objectSummary : s3ObjectListing.getObjectSummaries()) {
      String key = objectSummary.getKey();
      if (CcwRifLoadJob.REGEX_PENDING_MANIFEST.matcher(key).matches()) {
        /*
         * We've got an object that *looks like* it might be a manifest file,
         * but we still need to parse the key to ensure that it starts with a
         * valid timestamp.
         */
        DataSetManifestId manifestId = DataSetManifestId.parseManifestIdFromS3Key(key);
        if (manifestId != null) manifestIds.add(manifestId);
      } else if (CcwRifLoadJob.REGEX_COMPLETED_MANIFEST.matcher(key).matches()) {
        completedManifestsCount++;
      }
    }

    s3BucketListRequest.setContinuationToken(s3ObjectListing.getNextContinuationToken());
  } while (s3ObjectListing.isTruncated());
  this.completedManifestsCount = completedManifestsCount;

  LOGGER.debug("Scanned for data sets in S3. Found '{}'.", manifestIds.size());
  timerS3Scanning.close();

  return manifestIds;
}
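For reference, REGEX_PENDING_MANIFEST and REGEX_COMPLETED_MANIFEST live on CcwRifLoadJob and are not shown in this excerpt. Purely as an illustration of their shape (the prefix values and patterns below are assumptions; the project's real constants may differ), they could look roughly like this. Note that the timestamp segment is matched loosely, which is why the loop above still runs parseManifestIdFromS3Key on each candidate key:

import java.util.regex.Pattern;

/** Illustrative key patterns only; the project's real constants on CcwRifLoadJob may differ. */
final class ManifestKeyPatternsSketch {
  // Hypothetical prefix values for pending vs. completed data sets.
  static final String S3_PREFIX_PENDING_DATA_SETS = "Incoming";
  static final String S3_PREFIX_COMPLETED_DATA_SETS = "Done";

  // "<prefix>/<timestamp>/<sequence>_manifest.xml" -- the timestamp segment is left loose here,
  // so a key that matches still has to survive DataSetManifestId.parseManifestIdFromS3Key(...).
  static final Pattern REGEX_PENDING_MANIFEST =
      Pattern.compile("^" + S3_PREFIX_PENDING_DATA_SETS + "/[^/]+/\\d+_manifest\\.xml$");
  static final Pattern REGEX_COMPLETED_MANIFEST =
      Pattern.compile("^" + S3_PREFIX_COMPLETED_DATA_SETS + "/[^/]+/\\d+_manifest\\.xml$");
}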
use of gov.cms.bfd.pipeline.ccw.rif.extract.s3.DataSetManifest.DataSetManifestId in project beneficiary-fhir-data by CMSgov.
the class DataSetQueue method updatePendingDataSets.
/**
* Updates {@link #manifestsToProcess}, listing the manifests available in S3 right now, then
* adding those that weren't found before and removing those that are no longer pending.
*/
public void updatePendingDataSets() {
  // Find the pending manifests.
  Set<DataSetManifestId> manifestIdsPendingNow = listPendingManifests();

  /*
   * Add any newly discovered manifests to the list of those to be
   * processed. Ignore those that are already known to be invalid or
   * complete, and watch out for newly-discovered-to-be-invalid ones.
   */
  Set<DataSetManifestId> newManifests = new HashSet<>(manifestIdsPendingNow);
  newManifests.removeAll(knownInvalidManifests);
  newManifests.removeAll(recentlyProcessedManifests);
  newManifests.removeAll(
      manifestsToProcess.stream().map(DataSetManifest::getId).collect(Collectors.toSet()));
  newManifests.forEach(
      manifestId -> {
        String manifestS3Key = manifestId.computeS3Key(CcwRifLoadJob.S3_PREFIX_PENDING_DATA_SETS);
        DataSetManifest manifest = null;
        try {
          manifest = readManifest(s3TaskManager.getS3Client(), options, manifestS3Key);
        } catch (JAXBException e) {
          /*
           * We want to terminate the ETL load process if an invalid manifest was
           * found, e.g. one with an incorrect version number.
           */
          LOGGER.error(
              "Found data set with invalid manifest at '{}'. Load service will terminate. Error: {}",
              manifestS3Key,
              e.toString());
          knownInvalidManifests.add(manifestId);
          throw new RuntimeException(e);
        }

        // Finally, ensure that the manifest passes the options filter.
        if (!options.getDataSetFilter().test(manifest)) {
          LOGGER.debug("Skipping data set that doesn't pass filter: {}", manifest.toString());
          return;
        }

        // Everything checks out. Add it to the list!
        manifestsToProcess.add(manifest);
      });

  /*
   * Any manifests that weren't found have presumably been processed and
   * we should clean up the state that relates to them, to prevent memory
   * leaks.
   */
  for (Iterator<DataSetManifest> manifestsToProcessIterator = manifestsToProcess.iterator();
      manifestsToProcessIterator.hasNext(); ) {
    DataSetManifestId manifestId = manifestsToProcessIterator.next().getId();
    if (!manifestIdsPendingNow.contains(manifestId)) {
      manifestsToProcessIterator.remove();
      knownInvalidManifests.remove(manifestId);
      recentlyProcessedManifests.remove(manifestId);
      s3TaskManager.cleanupOldDataSet(manifestId);
    }
  }
}
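To show where these two methods sit in the wider pipeline, here is one plausible polling loop a load job could run on each scheduled pass over the bucket. Only updatePendingDataSets() appears in the excerpt above; the other DataSetQueue methods used below (getNextDataSetToProcess, markProcessed) are assumed names for illustration, not the project's confirmed API.

import gov.cms.bfd.pipeline.ccw.rif.extract.s3.DataSetManifest;
import java.util.Optional;

/** Hypothetical caller of DataSetQueue; the queue accessors used here are assumed for illustration. */
final class LoadJobPollSketch {
  private final DataSetQueue dataSetQueue;

  LoadJobPollSketch(DataSetQueue dataSetQueue) {
    this.dataSetQueue = dataSetQueue;
  }

  /** Runs once per scheduled poll of the S3 bucket. */
  void pollAndProcess() {
    // Refresh the queue from S3: pick up newly arrived manifests, drop ones no longer pending.
    dataSetQueue.updatePendingDataSets();

    // Drain whatever is pending, oldest data set first.
    Optional<DataSetManifest> next; // getNextDataSetToProcess() is an assumed accessor
    while ((next = dataSetQueue.getNextDataSetToProcess()).isPresent()) {
      DataSetManifest manifest = next.get();
      loadDataSet(manifest); // download and load the RIF files named by the manifest
      dataSetQueue.markProcessed(manifest); // assumed: record completion so the next scan skips it
    }
  }

  private void loadDataSet(DataSetManifest manifest) {
    // Placeholder for the actual RIF extract/load work.
  }
}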