Search in sources :

Example 1 with TestDataSetLocation

Use of gov.cms.bfd.model.rif.samples.TestDataSetLocation in the project beneficiary-fhir-data by CMSgov.

The example is the method main of the class DataSetSubsetter.

/**
 * The application entry point that can be used to run the {@link DataSetSubsetter}.
 *
 * <p>Each pass of the loop looks up a source {@link TestDataSetLocation} and the one a tenth its
 * size, downloads the source data set into a local directory, and writes the subset into a
 * sibling local directory.
 *
 * @param args (not used)
 * @throws Exception Any exceptions thrown will be bubbled up, terminating the app.
 */
public static void main(String[] args) throws Exception {
    /*
     * From the original source data set of 1M beneficiaries and their
     * claims, create subsets going all the way down by powers of ten. This
     * gives test authors lots of good options for how much data to test
     * against. Note that on Karl's `jordan-u` system, this took 5.5h to
     * run.
     */

    // The local output root is identical for every pass, so resolve and create it just once.
    Path outputDirectory = Paths.get(".", "test-data-random");
    Files.createDirectories(outputDirectory);

    for (int beneCount = 1000000; beneCount >= 10; beneCount /= 10) {
        // The final pass (beneCount == 10) targets a data set of a single beneficiary.
        final int targetBeneCount = beneCount / 10;
        TestDataSetLocation sourceDataSet = findDummyDataSetByBeneCount(beneCount);
        TestDataSetLocation targetDataSet = findDummyDataSetByBeneCount(targetBeneCount);

        // Figure out what directory to store the source in locally.
        String sourceDataSetId = extractDataSetId(sourceDataSet);
        Path sourceDataSetDirectory = outputDirectory.resolve(sourceDataSetId);

        // Download the source data set and build the target from it.
        ExtractionOptions options = new ExtractionOptions(sourceDataSet.getS3BucketName());
        String targetDataSetId = extractDataSetId(targetDataSet);
        Path targetDataSetDirectory = outputDirectory.resolve(targetDataSetId);

        // Whatever remains of the ID once the "<count>-beneficiaries-" prefix is stripped is parsed as the timestamp.
        Instant targetDataSetTimestamp = Instant.parse(targetDataSetId.replaceFirst("\\d+-beneficiaries-", ""));

        try (IDataSetWriter output = new LocalDataSetWriter(targetDataSetDirectory, targetDataSetTimestamp)) {
            Files.createDirectories(sourceDataSetDirectory);
            List<RifFile> rifFiles = downloadDataSet(options, sourceDataSetId, sourceDataSetDirectory);
            DataSetSubsetter.createSubset(output, targetBeneCount, rifFiles);
        }
    }
}

/**
 * Finds the {@link TestDataSetLocation} constant named {@code DUMMY_DATA_<beneCount>_BENES}.
 *
 * @param beneCount the beneficiary count that appears in the constant's name
 * @return the matching {@link TestDataSetLocation} constant
 * @throws java.util.NoSuchElementException if no constant has that name
 */
private static TestDataSetLocation findDummyDataSetByBeneCount(int beneCount) {
    String constantName = "DUMMY_DATA_" + beneCount + "_BENES";
    return Arrays.stream(TestDataSetLocation.class.getEnumConstants())
            // The name contains no regex metacharacters, so plain equality is the exact check needed.
            .filter(c -> c.name().equals(constantName))
            .findAny()
            .orElseThrow(() -> new java.util.NoSuchElementException(
                    "No TestDataSetLocation constant named " + constantName));
}

/**
 * Returns the last {@code /}-separated segment of the data set's S3 key prefix, which is used
 * here as the data set's ID and local directory name.
 *
 * @param dataSet the data set whose S3 key prefix should be examined
 * @return the final segment of {@code dataSet.getS3KeyPrefix()}
 * @throws java.util.NoSuchElementException if the key prefix yields no segments at all
 */
private static String extractDataSetId(TestDataSetLocation dataSet) {
    String keyPrefix = dataSet.getS3KeyPrefix();
    return Arrays.stream(keyPrefix.split("/"))
            // Keep only the last segment.
            .reduce((a, b) -> b)
            .orElseThrow(() -> new java.util.NoSuchElementException(
                    "S3 key prefix has no path segments: " + keyPrefix));
}
Also used : Path(java.nio.file.Path) Arrays(java.util.Arrays) CarrierClaimColumn(gov.cms.bfd.model.rif.CarrierClaimColumn) RifFileType(gov.cms.bfd.model.rif.RifFileType) S3RifFile(gov.cms.bfd.pipeline.ccw.rif.extract.s3.S3RifFile) S3Utilities(gov.cms.bfd.pipeline.ccw.rif.extract.s3.S3Utilities) LoggerFactory(org.slf4j.LoggerFactory) SNFClaimColumn(gov.cms.bfd.model.rif.SNFClaimColumn) HHAClaimColumn(gov.cms.bfd.model.rif.HHAClaimColumn) CSVFormat(org.apache.commons.csv.CSVFormat) Map(java.util.Map) CSVParser(org.apache.commons.csv.CSVParser) Path(java.nio.file.Path) TransferManagerBuilder(com.amazonaws.services.s3.transfer.TransferManagerBuilder) InpatientClaimColumn(gov.cms.bfd.model.rif.InpatientClaimColumn) Set(java.util.Set) RifFile(gov.cms.bfd.model.rif.RifFile) Instant(java.time.Instant) Collectors(java.util.stream.Collectors) JAXBException(javax.xml.bind.JAXBException) UncheckedIOException(java.io.UncheckedIOException) List(java.util.List) UncheckedJaxbException(gov.cms.bfd.sharedutils.exceptions.UncheckedJaxbException) Entry(java.util.Map.Entry) RifParsingUtils(gov.cms.bfd.model.rif.parse.RifParsingUtils) AmazonClientException(com.amazonaws.AmazonClientException) BeneficiaryColumn(gov.cms.bfd.model.rif.BeneficiaryColumn) CSVPrinter(org.apache.commons.csv.CSVPrinter) TransferManager(com.amazonaws.services.s3.transfer.TransferManager) LocalRifFile(gov.cms.bfd.pipeline.ccw.rif.extract.LocalRifFile) OutpatientClaimColumn(gov.cms.bfd.model.rif.OutpatientClaimColumn) HospiceClaimColumn(gov.cms.bfd.model.rif.HospiceClaimColumn) Marshaller(javax.xml.bind.Marshaller) HashMap(java.util.HashMap) ExtractionOptions(gov.cms.bfd.pipeline.ccw.rif.extract.ExtractionOptions) Download(com.amazonaws.services.s3.transfer.Download) ArrayList(java.util.ArrayList) DataSetManifestEntry(gov.cms.bfd.pipeline.ccw.rif.extract.s3.DataSetManifest.DataSetManifestEntry) HashSet(java.util.HashSet) TestDataSetLocation(gov.cms.bfd.model.rif.samples.TestDataSetLocation) 
AmazonS3(com.amazonaws.services.s3.AmazonS3) PartDEventColumn(gov.cms.bfd.model.rif.PartDEventColumn) JAXBContext(javax.xml.bind.JAXBContext) Unmarshaller(javax.xml.bind.Unmarshaller) Logger(org.slf4j.Logger) Files(java.nio.file.Files) FileWriter(java.io.FileWriter) IOException(java.io.IOException) DataSetManifest(gov.cms.bfd.pipeline.ccw.rif.extract.s3.DataSetManifest) Paths(java.nio.file.Paths) DMEClaimColumn(gov.cms.bfd.model.rif.DMEClaimColumn) Collections(java.util.Collections) S3RifFile(gov.cms.bfd.pipeline.ccw.rif.extract.s3.S3RifFile) RifFile(gov.cms.bfd.model.rif.RifFile) LocalRifFile(gov.cms.bfd.pipeline.ccw.rif.extract.LocalRifFile) TestDataSetLocation(gov.cms.bfd.model.rif.samples.TestDataSetLocation) Instant(java.time.Instant) ExtractionOptions(gov.cms.bfd.pipeline.ccw.rif.extract.ExtractionOptions)

Aggregations

AmazonClientException (com.amazonaws.AmazonClientException)1 AmazonS3 (com.amazonaws.services.s3.AmazonS3)1 Download (com.amazonaws.services.s3.transfer.Download)1 TransferManager (com.amazonaws.services.s3.transfer.TransferManager)1 TransferManagerBuilder (com.amazonaws.services.s3.transfer.TransferManagerBuilder)1 BeneficiaryColumn (gov.cms.bfd.model.rif.BeneficiaryColumn)1 CarrierClaimColumn (gov.cms.bfd.model.rif.CarrierClaimColumn)1 DMEClaimColumn (gov.cms.bfd.model.rif.DMEClaimColumn)1 HHAClaimColumn (gov.cms.bfd.model.rif.HHAClaimColumn)1 HospiceClaimColumn (gov.cms.bfd.model.rif.HospiceClaimColumn)1 InpatientClaimColumn (gov.cms.bfd.model.rif.InpatientClaimColumn)1 OutpatientClaimColumn (gov.cms.bfd.model.rif.OutpatientClaimColumn)1 PartDEventColumn (gov.cms.bfd.model.rif.PartDEventColumn)1 RifFile (gov.cms.bfd.model.rif.RifFile)1 RifFileType (gov.cms.bfd.model.rif.RifFileType)1 SNFClaimColumn (gov.cms.bfd.model.rif.SNFClaimColumn)1 RifParsingUtils (gov.cms.bfd.model.rif.parse.RifParsingUtils)1 TestDataSetLocation (gov.cms.bfd.model.rif.samples.TestDataSetLocation)1 ExtractionOptions (gov.cms.bfd.pipeline.ccw.rif.extract.ExtractionOptions)1 LocalRifFile (gov.cms.bfd.pipeline.ccw.rif.extract.LocalRifFile)1