Use of gov.cms.bfd.model.rif.RifFile in project beneficiary-fhir-data by CMSgov: class DataSetSubsetter, method downloadDataSet.
/**
 * Downloads the RIF files that comprise the specified data set from S3 to the local file system,
 * skipping any files that are already present locally.
 *
 * @param options the {@link ExtractionOptions} to use
 * @param dataSetS3KeyPrefix the S3 key prefix (i.e. directory) of the data set to download
 * @param downloadDirectory the Path to the directory to download the RIF files locally to
 * @return the {@link RifFile}s that comprise the full 1M beneficiary dummy data set
 */
private static List<RifFile> downloadDataSet(
    ExtractionOptions options, String dataSetS3KeyPrefix, Path downloadDirectory) {
  AmazonS3 s3Client = S3Utilities.createS3Client(options);
  TransferManager transferManager =
      TransferManagerBuilder.standard().withS3Client(s3Client).build();
  String dataSetPrefix = "data-random/" + dataSetS3KeyPrefix;
  String manifestSuffix = "1_manifest.xml";

  // Download the data set's manifest, unless it was already downloaded on an earlier run.
  Path manifestDownloadPath = downloadDirectory.resolve(manifestSuffix);
  if (!Files.exists(manifestDownloadPath)) {
    String manifestKey = String.format("%s/%s", dataSetPrefix, manifestSuffix);
    Download manifestDownload =
        transferManager.download(
            options.getS3BucketName(), manifestKey, manifestDownloadPath.toFile());
    try {
      manifestDownload.waitForCompletion();
    } catch (AmazonClientException | InterruptedException e) {
      throw new RuntimeException(e);
    }
  }
  LOGGER.info("Manifest downloaded.");

  // Parse the manifest, which lists the RIF files in the data set.
  DataSetManifest dummyDataSetManifest;
  try {
    JAXBContext jaxbContext = JAXBContext.newInstance(DataSetManifest.class);
    Unmarshaller jaxbUnmarshaller = jaxbContext.createUnmarshaller();
    dummyDataSetManifest =
        (DataSetManifest) jaxbUnmarshaller.unmarshal(manifestDownloadPath.toFile());
  } catch (JAXBException e) {
    throw new UncheckedJaxbException(e);
  }

  // Download each RIF file listed in the manifest, again skipping files already present.
  List<RifFile> rifFiles = new ArrayList<>();
  for (DataSetManifestEntry manifestEntry : dummyDataSetManifest.getEntries()) {
    String dataSetFileKey = String.format("%s/%s", dataSetPrefix, manifestEntry.getName());
    Path dataSetFileDownloadPath = downloadDirectory.resolve(manifestEntry.getName());
    if (!Files.exists(dataSetFileDownloadPath)) {
      LOGGER.info("Downloading RIF file: '{}'...", manifestEntry.getName());
      Download dataSetFileDownload =
          transferManager.download(
              options.getS3BucketName(), dataSetFileKey, dataSetFileDownloadPath.toFile());
      try {
        dataSetFileDownload.waitForCompletion();
      } catch (AmazonClientException | InterruptedException e) {
        throw new RuntimeException(e);
      }
    }
    RifFile dataSetFile = new LocalRifFile(dataSetFileDownloadPath, manifestEntry.getType());
    rifFiles.add(dataSetFile);
  }
  transferManager.shutdownNow();
  LOGGER.info("Original RIF files ready.");
  return rifFiles;
}
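The LocalRifFile created above is a thin adapter that exposes a downloaded file through the RifFile interface (its S3-backed analogue is S3RifFile). Its implementation isn't shown on this page; the following is a minimal, hypothetical sketch of such an adapter. Only getDisplayName() and getFileType() appear in the excerpts here, so the getCharset() and open() methods, and the UTF-8 charset, are assumptions:

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UncheckedIOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;

/** Hypothetical sketch: exposes a file on the local file system as a RifFile. */
public final class LocalRifFile implements RifFile {
  private final Path localFile;
  private final RifFileType fileType;

  public LocalRifFile(Path localFile, RifFileType fileType) {
    this.localFile = localFile;
    this.fileType = fileType;
  }

  @Override
  public String getDisplayName() {
    return localFile.toString();
  }

  @Override
  public RifFileType getFileType() {
    return fileType;
  }

  @Override
  public Charset getCharset() {
    // Assumption: dummy-data RIF files are UTF-8 encoded.
    return StandardCharsets.UTF_8;
  }

  @Override
  public InputStream open() {
    try {
      return new BufferedInputStream(Files.newInputStream(localFile));
    } catch (IOException e) {
      throw new UncheckedIOException(e);
    }
  }
}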
Use of gov.cms.bfd.model.rif.RifFile in project beneficiary-fhir-data by CMSgov: class DataSetSubsetter, method createSubset.
/**
 * Creates a subset of the specified input {@link RifFile}s, writing out the results via the
 * {@link CSVPrinter}s provided by the specified {@link IDataSetWriter}.
 *
 * @param output the {@link IDataSetWriter} to get the needed {@link CSVPrinter}s from
 * @param beneficiaryCount the target beneficiary count of the copy/subset to create
 * @param rifFiles the input {@link RifFile}s to be subsetted
 * @throws IOException Any {@link IOException}s encountered will be bubbled up.
 */
public static void createSubset(IDataSetWriter output, int beneficiaryCount, List<RifFile> rifFiles)
    throws IOException {
  // Collect every beneficiary ID in the input beneficiary files, verifying uniqueness.
  LOGGER.info("Scanning beneficiary IDs...");
  List<RifFile> beneficiaryFiles =
      rifFiles.stream()
          .filter(f -> f.getFileType() == RifFileType.BENEFICIARY)
          .collect(Collectors.toList());
  List<String> beneficiaryIds = new ArrayList<>();
  for (RifFile beneficiaryFile : beneficiaryFiles) {
    CSVParser parser = RifParsingUtils.createCsvParser(beneficiaryFile);
    parser.forEach(
        r -> {
          String beneficiaryId = r.get(BeneficiaryColumn.BENE_ID);
          // Duplicate beneficiary IDs indicate a corrupt source data set.
          if (beneficiaryIds.contains(beneficiaryId)) throw new IllegalStateException();
          beneficiaryIds.add(beneficiaryId);
        });
    parser.close();
  }
  LOGGER.info("Scanned beneficiary IDs.");

  // Randomly select the requested number of beneficiary IDs.
  Set<String> selectedBeneficiaryIds = new HashSet<>(beneficiaryCount);
  Collections.shuffle(beneficiaryIds);
  for (int i = 0; i < beneficiaryCount; i++) selectedBeneficiaryIds.add(beneficiaryIds.get(i));
  LOGGER.info("Selected '{}' random beneficiary IDs.", beneficiaryCount);

  // Each RIF file type stores the beneficiary ID in a different column.
  Map<RifFileType, Enum<?>> beneficiaryColumnByFileType = new HashMap<>();
  beneficiaryColumnByFileType.put(RifFileType.BENEFICIARY, BeneficiaryColumn.BENE_ID);
  beneficiaryColumnByFileType.put(RifFileType.CARRIER, CarrierClaimColumn.BENE_ID);
  beneficiaryColumnByFileType.put(RifFileType.DME, DMEClaimColumn.BENE_ID);
  beneficiaryColumnByFileType.put(RifFileType.HHA, HHAClaimColumn.BENE_ID);
  beneficiaryColumnByFileType.put(RifFileType.HOSPICE, HospiceClaimColumn.BENE_ID);
  beneficiaryColumnByFileType.put(RifFileType.INPATIENT, InpatientClaimColumn.BENE_ID);
  beneficiaryColumnByFileType.put(RifFileType.OUTPATIENT, OutpatientClaimColumn.BENE_ID);
  beneficiaryColumnByFileType.put(RifFileType.PDE, PartDEventColumn.BENE_ID);
  beneficiaryColumnByFileType.put(RifFileType.SNF, SNFClaimColumn.BENE_ID);

  // Copy over only the records that belong to the selected beneficiaries.
  for (RifFile rifFile : rifFiles) {
    LOGGER.info("Subsetting RIF file: '{}'...", rifFile.getDisplayName());
    CSVPrinter rifFilePrinter = output.getPrinter(rifFile.getFileType());
    CSVParser parser = RifParsingUtils.createCsvParser(rifFile);

    /*
     * When we created the CSVPrinter, we told it to skip the header.
     * That ensures that we don't write out a header until we've started
     * reading the file and know what it is. Here, we print a "fake"
     * first record with the header, as read from the input file.
     * Previously, we'd been having the CSVPrinter create a header based
     * on our RIF column enums, but that leads to us propagating errors
     * in those enums to the sample files. It's better to let the files
     * tell us what their headers are.
     */
    rifFilePrinter.printRecord(
        parser.getHeaderMap().entrySet().stream()
            .sorted(Map.Entry.comparingByValue())
            .map(e -> e.getKey())
            .toArray());
    parser.forEach(
        r -> {
          String beneficiaryId = r.get(beneficiaryColumnByFileType.get(rifFile.getFileType()));
          if (selectedBeneficiaryIds.contains(beneficiaryId))
            try {
              rifFilePrinter.printRecord(r);
            } catch (IOException e) {
              throw new UncheckedIOException(e);
            }
        });
  }
  LOGGER.info("Subsetted all RIF files.");
}
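The header-copying trick described in the comment above only works if the IDataSetWriter hands out CSVPrinters that were told not to emit their own header row. A minimal, hypothetical sketch of a writer in that spirit follows; the real LocalDataSetWriter differs, and the RFC4180 base format, pipe delimiter, and file-naming scheme here are all assumptions:

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;

/** Hypothetical sketch of a writer that produces one CSV file per RIF file type. */
public final class SketchDataSetWriter implements AutoCloseable {
  private final Path outputDirectory;
  private final Map<RifFileType, CSVPrinter> printers = new HashMap<>();

  public SketchDataSetWriter(Path outputDirectory) {
    this.outputDirectory = outputDirectory;
  }

  /** Lazily creates (and caches) the CSVPrinter for the specified RIF file type. */
  public CSVPrinter getPrinter(RifFileType fileType) throws IOException {
    CSVPrinter printer = printers.get(fileType);
    if (printer == null) {
      /*
       * withSkipHeaderRecord() keeps the printer from emitting a header row of
       * its own, so createSubset(...) can copy the header verbatim from the
       * input file instead.
       */
      CSVFormat format = CSVFormat.RFC4180.withDelimiter('|').withSkipHeaderRecord();
      Path outputFile = outputDirectory.resolve(fileType.name().toLowerCase() + ".rif");
      printer = new CSVPrinter(Files.newBufferedWriter(outputFile), format);
      printers.put(fileType, printer);
    }
    return printer;
  }

  @Override
  public void close() throws IOException {
    for (CSVPrinter printer : printers.values()) printer.close();
  }
}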
Use of gov.cms.bfd.model.rif.RifFile in project beneficiary-fhir-data by CMSgov: class RifFilesProcessor, method produceRecords.
/**
 * Produces the {@link RifRecordEvent}s for the specified {@link RifFileEvent}.
 *
 * @param rifFileEvent the {@link RifFileEvent} that is being processed
 * @return a {@link RifFileRecords} with the {@link RifRecordEvent}s produced from the specified
 *     {@link RifFileEvent}
 */
public RifFileRecords produceRecords(RifFileEvent rifFileEvent) {
  RifFile file = rifFileEvent.getFile();

  /*
   * Approach used here to parse CSV as a Java 8 Stream is courtesy of
   * https://rumianom.pl/rumianom/entry/apache-commons-csv-with-java.
   */
  CSVParser parser = RifParsingUtils.createCsvParser(file);

  // Select the parsing strategy for the file type: beneficiary-related and Part D
  // event files have one record per row, while claim files group rows by claim ID.
  boolean isGrouped;
  BiFunction<RifFileEvent, List<CSVRecord>, RifRecordEvent<?>> recordParser;
  if (file.getFileType() == RifFileType.BENEFICIARY) {
    isGrouped = false;
    recordParser = RifFilesProcessor::buildBeneficiaryEvent;
  } else if (file.getFileType() == RifFileType.BENEFICIARY_HISTORY) {
    isGrouped = false;
    recordParser = RifFilesProcessor::buildBeneficiaryHistoryEvent;
  } else if (file.getFileType() == RifFileType.MEDICARE_BENEFICIARY_ID_HISTORY) {
    isGrouped = false;
    recordParser = RifFilesProcessor::buildMedicareBeneficiaryIdHistoryEvent;
  } else if (file.getFileType() == RifFileType.PDE) {
    isGrouped = false;
    recordParser = RifFilesProcessor::buildPartDEvent;
  } else if (file.getFileType() == RifFileType.CARRIER) {
    isGrouped = true;
    recordParser = RifFilesProcessor::buildCarrierClaimEvent;
  } else if (file.getFileType() == RifFileType.INPATIENT) {
    isGrouped = true;
    recordParser = RifFilesProcessor::buildInpatientClaimEvent;
  } else if (file.getFileType() == RifFileType.OUTPATIENT) {
    isGrouped = true;
    recordParser = RifFilesProcessor::buildOutpatientClaimEvent;
  } else if (file.getFileType() == RifFileType.SNF) {
    isGrouped = true;
    recordParser = RifFilesProcessor::buildSNFClaimEvent;
  } else if (file.getFileType() == RifFileType.HOSPICE) {
    isGrouped = true;
    recordParser = RifFilesProcessor::buildHospiceClaimEvent;
  } else if (file.getFileType() == RifFileType.HHA) {
    isGrouped = true;
    recordParser = RifFilesProcessor::buildHHAClaimEvent;
  } else if (file.getFileType() == RifFileType.DME) {
    isGrouped = true;
    recordParser = RifFilesProcessor::buildDMEClaimEvent;
  } else {
    throw new UnsupportedRifFileTypeException("Unsupported file type: " + file.getFileType());
  }

  /*
   * Use the CSVParser to drive a Stream of grouped CSVRecords
   * (specifically, group by claim ID/lines).
   */
  CsvRecordGrouper grouper =
      new ColumnValueCsvRecordGrouper(isGrouped ? file.getFileType().getIdColumn() : null);
  Iterator<List<CSVRecord>> csvIterator = new CsvRecordGroupingIterator(parser, grouper);
  Spliterator<List<CSVRecord>> spliterator =
      Spliterators.spliteratorUnknownSize(csvIterator, Spliterator.ORDERED | Spliterator.NONNULL);
  Stream<List<CSVRecord>> csvRecordStream =
      StreamSupport.stream(spliterator, false)
          .onClose(
              () -> {
                try {
                  /*
                   * This will also close the Reader and InputStream that the
                   * CSVParser was consuming.
                   */
                  parser.close();
                } catch (IOException e) {
                  LOGGER.warn("Unable to close CSVParser", e);
                }
              });

  /* Map each record group to a single RifRecordEvent. */
  Stream<RifRecordEvent<?>> rifRecordStream =
      csvRecordStream.map(
          csvRecordGroup -> {
            try {
              Timer.Context parsingTimer =
                  rifFileEvent
                      .getEventMetrics()
                      .timer(MetricRegistry.name(getClass().getSimpleName(), "recordParsing"))
                      .time();
              RifRecordEvent<?> recordEvent = recordParser.apply(rifFileEvent, csvRecordGroup);
              parsingTimer.close();
              return recordEvent;
            } catch (InvalidRifValueException e) {
              LOGGER.warn(
                  "Parse error encountered near line number '{}'.",
                  csvRecordGroup.get(0).getRecordNumber());
              throw new InvalidRifValueException(e);
            }
          });
  return new RifFileRecords(rifFileEvent, rifRecordStream);
}
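CsvRecordGroupingIterator and ColumnValueCsvRecordGrouper are project classes that aren't shown on this page. To make the grouping step concrete, here is an illustrative, self-contained sketch of the underlying idea, grouping consecutive CSV records that share a value in a key column. The class and its API are inventions for illustration, not the real implementation:

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import org.apache.commons.csv.CSVRecord;

/** Illustrative sketch: groups consecutive CSVRecords sharing a value in a key column. */
final class GroupingIteratorSketch implements Iterator<List<CSVRecord>> {
  private final Iterator<CSVRecord> records;
  private final String keyColumn; // null => every record is its own group
  private CSVRecord pending; // first record of the next group, if already read

  GroupingIteratorSketch(Iterable<CSVRecord> parser, String keyColumn) {
    this.records = parser.iterator();
    this.keyColumn = keyColumn;
  }

  @Override
  public boolean hasNext() {
    return pending != null || records.hasNext();
  }

  @Override
  public List<CSVRecord> next() {
    if (!hasNext()) throw new NoSuchElementException();
    CSVRecord first = pending != null ? pending : records.next();
    pending = null;
    List<CSVRecord> group = new ArrayList<>();
    group.add(first);
    if (keyColumn == null) return group; // ungrouped file types: single-record groups
    // Pull records until the key column's value changes; the first mismatch is
    // remembered as the start of the next group.
    while (records.hasNext()) {
      CSVRecord candidate = records.next();
      if (candidate.get(keyColumn).equals(first.get(keyColumn))) {
        group.add(candidate);
      } else {
        pending = candidate;
        break;
      }
    }
    return group;
  }
}

Because org.apache.commons.csv.CSVParser implements Iterable<CSVRecord>, a parser can be passed directly as the first constructor argument, mirroring how produceRecords(...) wires the real iterator.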
Use of gov.cms.bfd.model.rif.RifFile in project beneficiary-fhir-data by CMSgov: class DataSetSubsetter, method main.
/**
 * The application entry point that can be used to run the {@link DataSetSubsetter}.
 *
 * @param args (not used)
 * @throws Exception Any exceptions thrown will be bubbled up, terminating the app.
 */
public static void main(String[] args) throws Exception {
  /*
   * From the original source data set of 1M beneficiaries and their
   * claims, create subsets going all the way down by powers of ten. This
   * gives test authors lots of good options for how much data to test
   * against. Note that on Karl's `jordan-u` system, this took 5.5h to
   * run.
   */
  for (int beneCount = 1000000; beneCount >= 10; beneCount /= 10) {
    // Grab the source and target constants.
    final int sourceBeneCount = beneCount;
    final int targetBeneCount = beneCount / 10;
    TestDataSetLocation sourceDataSet =
        Arrays.stream(TestDataSetLocation.class.getEnumConstants())
            .filter(c -> c.name().matches("DUMMY_DATA_" + sourceBeneCount + "_BENES"))
            .findAny()
            .get();
    TestDataSetLocation targetDataSet =
        Arrays.stream(TestDataSetLocation.class.getEnumConstants())
            .filter(c -> c.name().matches("DUMMY_DATA_" + targetBeneCount + "_BENES"))
            .findAny()
            .get();

    // Figure out which directories to store the source in locally.
    Path outputDirectory = Paths.get(".", "test-data-random");
    Files.createDirectories(outputDirectory);
    String sourceDataSetId =
        Arrays.stream(sourceDataSet.getS3KeyPrefix().split("/")).reduce((a, b) -> b).get();
    Path sourceDataSetDirectory = outputDirectory.resolve(sourceDataSetId);

    // Download the source data set and build the target from it.
    ExtractionOptions options = new ExtractionOptions(sourceDataSet.getS3BucketName());
    String targetDataSetId =
        Arrays.stream(targetDataSet.getS3KeyPrefix().split("/")).reduce((a, b) -> b).get();
    Path targetDataSetDirectory = outputDirectory.resolve(targetDataSetId);
    Instant targetDataSetTimestamp =
        Instant.parse(targetDataSetId.replaceFirst("\\d+-beneficiaries-", ""));
    try (IDataSetWriter output =
        new LocalDataSetWriter(targetDataSetDirectory, targetDataSetTimestamp)) {
      Files.createDirectories(sourceDataSetDirectory);
      List<RifFile> rifFiles = downloadDataSet(options, sourceDataSetId, sourceDataSetDirectory);
      DataSetSubsetter.createSubset(output, targetBeneCount, rifFiles);
    }
  }
}
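The data set ID and timestamp extraction above is terse, so a small worked example shows what it computes. The key prefix value here is made up to match the naming scheme the code expects:

import java.time.Instant;
import java.util.Arrays;

public class DataSetIdExample {
  public static void main(String[] args) {
    // Illustrative only: a made-up S3 key prefix in the expected naming scheme.
    String s3KeyPrefix = "data-random/1000-beneficiaries-2017-10-01T00:00:00Z";

    // reduce((a, b) -> b) keeps only the last '/'-separated segment.
    String dataSetId = Arrays.stream(s3KeyPrefix.split("/")).reduce((a, b) -> b).get();
    System.out.println(dataSetId); // 1000-beneficiaries-2017-10-01T00:00:00Z

    // Stripping the "<count>-beneficiaries-" prefix leaves an ISO-8601 instant.
    Instant timestamp = Instant.parse(dataSetId.replaceFirst("\\d+-beneficiaries-", ""));
    System.out.println(timestamp); // 2017-10-01T00:00:00Z
  }
}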
Use of gov.cms.bfd.model.rif.RifFile in project beneficiary-fhir-data by CMSgov: class RifLoaderIT, method failOnUpdateBeneficiaryBeforeInsert.
/**
 * Runs {@link RifLoader} against the {@link StaticRifResourceGroup#SAMPLE_A} data for an
 * <code>UPDATE</code> {@link Beneficiary} record that there hasn't been a previous
 * <code>INSERT</code> on, to verify that this fails as expected.
 */
@Test
public void failOnUpdateBeneficiaryBeforeInsert() {
  // Tweak the SAMPLE_A beneficiary to be an UPDATE.
  Stream<RifFile> samplesStream =
      filterSamples(
          r -> r.getFileType() == RifFileType.BENEFICIARY,
          StaticRifResourceGroup.SAMPLE_A.getResources());
  Function<RifRecordEvent<?>, List<List<String>>> recordEditor =
      rifRecordEvent -> {
        CSVRecord beneCsvRow = rifRecordEvent.getRawCsvRecords().get(0);
        List<String> beneCsvValues =
            StreamSupport.stream(beneCsvRow.spliterator(), false).collect(Collectors.toList());
        beneCsvValues.set(0, "UPDATE");
        return List.of(beneCsvValues);
      };
  Function<RifFile, RifFile> fileEditor = sample -> editSampleRecords(sample, recordEditor);
  Stream<RifFile> editedSample = editSamples(samplesStream, fileEditor);

  // Load the edited sample to verify that it fails, as expected.
  AssertionFailedError thrown =
      assertThrows(
          AssertionFailedError.class,
          () ->
              loadSample(
                  "SAMPLE_A, bene only, UPDATE",
                  CcwRifLoadTestUtils.getLoadOptions(),
                  editedSample));
  assertTrue(thrown.getMessage().contains("Load errors encountered"));
}
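filterSamples, editSamples, and editSampleRecords are private helpers in RifLoaderIT whose bodies aren't shown on this page. As a rough sketch, the two stream-level helpers could look something like the following; the signatures are inferred from the call sites above, and the StaticRifResource.toRifFile() conversion is an assumption:

import java.util.Arrays;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.stream.Stream;

/* Sketch only: signatures inferred from the call sites in the test above. */
private static Stream<RifFile> filterSamples(
    Predicate<RifFile> filter, StaticRifResource... resources) {
  // Convert each static sample resource to a RifFile, then keep only the matches.
  return Arrays.stream(resources).map(StaticRifResource::toRifFile).filter(filter);
}

private static Stream<RifFile> editSamples(
    Stream<RifFile> samples, Function<RifFile, RifFile> fileEditor) {
  // Lazily apply the editor to each sample file as the stream is consumed.
  return samples.map(fileEditor);
}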