Search in sources :

Example 1 with DataSetManifestId

use of gov.cms.bfd.pipeline.ccw.rif.extract.s3.DataSetManifest.DataSetManifestId in project beneficiary-fhir-data by CMSgov.

the class DataSetManifestTest method manifestIdRoundtrip.

/**
 * Regression test for <a href="http://issues.hhsdevcloud.us/browse/CBBD-298">CBBD-298: Error
 * reading some data set manifests in S3: "AmazonS3Exception: The specified key does not
 * exist"</a>.
 *
 * <p>Parses a pending-manifest S3 key into a {@link DataSetManifestId}, recomputes the key from
 * that ID, and checks that the result equals the key that was parsed.
 */
@Test
public void manifestIdRoundtrip() {
    String pendingPrefix = CcwRifLoadJob.S3_PREFIX_PENDING_DATA_SETS;
    String originalKey = pendingPrefix + "/2017-07-11T00:00:00.000Z/1_manifest.xml";

    DataSetManifestId parsedId = DataSetManifestId.parseManifestIdFromS3Key(originalKey);
    String recomputedKey = parsedId.computeS3Key(pendingPrefix);

    assertEquals(originalKey, recomputedKey);
}
Also used : DataSetManifestId(gov.cms.bfd.pipeline.ccw.rif.extract.s3.DataSetManifest.DataSetManifestId) Test(org.junit.jupiter.api.Test)

Example 2 with DataSetManifestId

use of gov.cms.bfd.pipeline.ccw.rif.extract.s3.DataSetManifest.DataSetManifestId in project beneficiary-fhir-data by CMSgov.

the class DataSetQueue method listPendingManifests.

/**
 * Scans the configured S3 bucket and collects the IDs of the pending data set manifests.
 *
 * <p>Every object key returned by the (paged) bucket listing is inspected: keys matching {@link
 * CcwRifLoadJob#REGEX_PENDING_MANIFEST} are parsed into {@link DataSetManifestId}s and collected,
 * while keys matching {@link CcwRifLoadJob#REGEX_COMPLETED_MANIFEST} are only counted. As a side
 * effect, {@link #completedManifestsCount} is overwritten with the number of completed manifests
 * seen during this scan. The whole scan is timed under the "s3Scanning" metric.
 *
 * @return the {@link DataSetManifestId}s of the pending manifests found in S3; the result is a
 *     {@link HashSet}, so it has no defined iteration order and callers that need processing
 *     order must sort it themselves
 */
private Set<DataSetManifestId> listPendingManifests() {
    // try-with-resources ensures the timer is stopped even when the S3 listing throws.
    try (Timer.Context ignored = appMetrics.timer(MetricRegistry.name(getClass().getSimpleName(), "s3Scanning")).time()) {
        LOGGER.debug("Scanning for data sets in S3...");
        Set<DataSetManifestId> manifestIds = new HashSet<>();
        /*
         * Request a list of the objects in the configured bucket, optionally
         * capping the page size.
         */
        ListObjectsV2Request s3BucketListRequest = new ListObjectsV2Request();
        s3BucketListRequest.setBucketName(options.getS3BucketName());
        if (options.getS3ListMaxKeys().isPresent()) {
            s3BucketListRequest.setMaxKeys(options.getS3ListMaxKeys().get());
        }
        /*
         * S3 will return results in separate pages. Loop through all of the
         * pages, looking for manifests.
         */
        int completedManifestsCount = 0;
        ListObjectsV2Result s3ObjectListing;
        do {
            s3ObjectListing = s3TaskManager.getS3Client().listObjectsV2(s3BucketListRequest);
            for (S3ObjectSummary objectSummary : s3ObjectListing.getObjectSummaries()) {
                String key = objectSummary.getKey();
                if (CcwRifLoadJob.REGEX_PENDING_MANIFEST.matcher(key).matches()) {
                    /*
                     * We've got an object that *looks like* it might be a
                     * manifest file. But we need to parse the key to ensure
                     * that it starts with a valid timestamp; keys that fail
                     * to parse (null ID) are skipped.
                     */
                    DataSetManifestId manifestId = DataSetManifestId.parseManifestIdFromS3Key(key);
                    if (manifestId != null) {
                        manifestIds.add(manifestId);
                    }
                } else if (CcwRifLoadJob.REGEX_COMPLETED_MANIFEST.matcher(key).matches()) {
                    completedManifestsCount++;
                }
            }
            // Advance to the next page of the listing.
            s3BucketListRequest.setContinuationToken(s3ObjectListing.getNextContinuationToken());
        } while (s3ObjectListing.isTruncated());
        // Publish the count only after the full scan has completed successfully.
        this.completedManifestsCount = completedManifestsCount;
        // Report what this scan found, not the size of the not-yet-updated processing queue.
        LOGGER.debug("Scanned for data sets in S3. Found '{}'.", manifestIds.size());
        return manifestIds;
    }
}
Also used : Timer(com.codahale.metrics.Timer) ListObjectsV2Request(com.amazonaws.services.s3.model.ListObjectsV2Request) ListObjectsV2Result(com.amazonaws.services.s3.model.ListObjectsV2Result) DataSetManifestId(gov.cms.bfd.pipeline.ccw.rif.extract.s3.DataSetManifest.DataSetManifestId) S3ObjectSummary(com.amazonaws.services.s3.model.S3ObjectSummary) HashSet(java.util.HashSet)

Example 3 with DataSetManifestId

use of gov.cms.bfd.pipeline.ccw.rif.extract.s3.DataSetManifest.DataSetManifestId in project beneficiary-fhir-data by CMSgov.

the class DataSetQueue method updatePendingDataSets.

/**
 * Refreshes {@link #manifestsToProcess} against the manifests currently pending in S3: newly
 * discovered manifests are read and queued, and queued manifests that are no longer pending are
 * dropped together with the state tracked for them.
 *
 * @throws RuntimeException wrapping the {@link JAXBException} if a newly discovered manifest
 *     cannot be read; the manifest ID is recorded in {@link #knownInvalidManifests} first
 */
public void updatePendingDataSets() {
    // Snapshot of what is pending in S3 at this moment.
    Set<DataSetManifestId> pendingNow = listPendingManifests();

    /*
     * Candidates are the pending manifests minus everything already
     * accounted for: known-invalid, recently processed, or already queued.
     */
    Set<DataSetManifestId> alreadyQueuedIds = manifestsToProcess.stream().map(DataSetManifest::getId).collect(Collectors.toSet());
    Set<DataSetManifestId> candidates = new HashSet<>(pendingNow);
    candidates.removeAll(knownInvalidManifests);
    candidates.removeAll(recentlyProcessedManifests);
    candidates.removeAll(alreadyQueuedIds);

    for (DataSetManifestId candidateId : candidates) {
        String s3Key = candidateId.computeS3Key(CcwRifLoadJob.S3_PREFIX_PENDING_DATA_SETS);
        DataSetManifest parsed;
        try {
            parsed = readManifest(s3TaskManager.getS3Client(), options, s3Key);
        } catch (JAXBException e) {
            /*
             * An unreadable manifest (e.g. an incorrect version number) is
             * fatal: remember it as invalid and abort the ETL load process.
             */
            LOGGER.error("Found data set with invalid manifest at '{}'. Load service will terminating. Error: {}", s3Key, e.toString());
            knownInvalidManifests.add(candidateId);
            throw new RuntimeException(e);
        }

        // Only manifests accepted by the configured filter get queued.
        if (options.getDataSetFilter().test(parsed)) {
            manifestsToProcess.add(parsed);
        } else {
            LOGGER.debug("Skipping data set that doesn't pass filter: {}", parsed.toString());
        }
    }

    /*
     * Queued manifests that no longer show up as pending have presumably
     * been processed; drop them and their related state to prevent memory
     * leaks.
     */
    Iterator<DataSetManifest> queued = manifestsToProcess.iterator();
    while (queued.hasNext()) {
        DataSetManifestId queuedId = queued.next().getId();
        if (manifestIdsStillPending(pendingNow, queuedId)) {
            continue;
        }
        queued.remove();
        knownInvalidManifests.remove(queuedId);
        recentlyProcessedManifests.remove(queuedId);
        s3TaskManager.cleanupOldDataSet(queuedId);
    }
}

/** Returns whether {@code manifestId} is among the IDs currently pending in S3. */
private static boolean manifestIdsStillPending(Set<DataSetManifestId> pendingNow, DataSetManifestId manifestId) {
    return pendingNow.contains(manifestId);
}
Also used : DataSetManifestId(gov.cms.bfd.pipeline.ccw.rif.extract.s3.DataSetManifest.DataSetManifestId) JAXBException(javax.xml.bind.JAXBException) HashSet(java.util.HashSet)

Aggregations

DataSetManifestId (gov.cms.bfd.pipeline.ccw.rif.extract.s3.DataSetManifest.DataSetManifestId)3 HashSet (java.util.HashSet)2 ListObjectsV2Request (com.amazonaws.services.s3.model.ListObjectsV2Request)1 ListObjectsV2Result (com.amazonaws.services.s3.model.ListObjectsV2Result)1 S3ObjectSummary (com.amazonaws.services.s3.model.S3ObjectSummary)1 Timer (com.codahale.metrics.Timer)1 JAXBException (javax.xml.bind.JAXBException)1 Test (org.junit.jupiter.api.Test)1