use of gov.cms.bfd.pipeline.ccw.rif.extract.LocalRifFile in project beneficiary-fhir-data by CMSgov.
the class DataSetSubsetter method downloadDataSet.
/**
* @param options the {@link ExtractionOptions} to use
* @param dataSetS3KeyPrefix the S3 key prefix (i.e. directory) of the data set to download
* @param downloadDirectory the {@link Path} of the local directory to download the RIF files to
* @return the {@link RifFile}s (downloaded locally as {@link LocalRifFile}s) that comprise the full 1M beneficiary dummy data set
*/
private static List<RifFile> downloadDataSet(
    ExtractionOptions options, String dataSetS3KeyPrefix, Path downloadDirectory) {
  AmazonS3 s3Client = S3Utilities.createS3Client(options);
  TransferManager transferManager =
      TransferManagerBuilder.standard().withS3Client(s3Client).build();
  String dataSetPrefix = "data-random/" + dataSetS3KeyPrefix;
  String manifestSuffix = "1_manifest.xml";

  Path manifestDownloadPath = downloadDirectory.resolve(manifestSuffix);
  if (!Files.exists(manifestDownloadPath)) {
    String manifestKey = String.format("%s/%s", dataSetPrefix, manifestSuffix);
    Download manifestDownload =
        transferManager.download(
            options.getS3BucketName(), manifestKey, manifestDownloadPath.toFile());
    try {
      manifestDownload.waitForCompletion();
    } catch (AmazonClientException | InterruptedException e) {
      throw new RuntimeException(e);
    }
  }
  LOGGER.info("Manifest downloaded.");

  DataSetManifest dummyDataSetManifest;
  try {
    JAXBContext jaxbContext = JAXBContext.newInstance(DataSetManifest.class);
    Unmarshaller jaxbUnmarshaller = jaxbContext.createUnmarshaller();
    dummyDataSetManifest =
        (DataSetManifest) jaxbUnmarshaller.unmarshal(manifestDownloadPath.toFile());
  } catch (JAXBException e) {
    throw new UncheckedJaxbException(e);
  }

  List<RifFile> rifFiles = new ArrayList<>();
  for (DataSetManifestEntry manifestEntry : dummyDataSetManifest.getEntries()) {
    String dataSetFileKey = String.format("%s/%s", dataSetPrefix, manifestEntry.getName());
    Path dataSetFileDownloadPath = downloadDirectory.resolve(manifestEntry.getName());
    if (!Files.exists(dataSetFileDownloadPath)) {
      LOGGER.info("Downloading RIF file: '{}'...", manifestEntry.getName());
      Download dataSetFileDownload =
          transferManager.download(
              options.getS3BucketName(), dataSetFileKey, dataSetFileDownloadPath.toFile());
      try {
        dataSetFileDownload.waitForCompletion();
      } catch (AmazonClientException | InterruptedException e) {
        throw new RuntimeException(e);
      }
    }
    RifFile dataSetFile = new LocalRifFile(dataSetFileDownloadPath, manifestEntry.getType());
    rifFiles.add(dataSetFile);
  }

  transferManager.shutdownNow();
  LOGGER.info("Original RIF files ready.");
  return rifFiles;
}
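The manifest download and the per-entry downloads above repeat the same download-then-wait idiom. A small helper along these lines could factor it out (a sketch only; the downloadAndWait name is hypothetical and not part of DataSetSubsetter):

/**
 * Downloads a single S3 object (if not already present locally) and blocks until the
 * transfer completes. Hypothetical helper, sketched from the pattern above.
 */
private static void downloadAndWait(
    TransferManager transferManager, String bucket, String key, Path localPath) {
  if (Files.exists(localPath)) return; // skip files already downloaded on a previous run
  Download download = transferManager.download(bucket, key, localPath.toFile());
  try {
    download.waitForCompletion();
  } catch (AmazonClientException | InterruptedException e) {
    throw new RuntimeException(e);
  }
}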
use of gov.cms.bfd.pipeline.ccw.rif.extract.LocalRifFile in project beneficiary-fhir-data by CMSgov.
the class SyntheticDataFixer method fixBeneficiaryFile.
/**
* Process the original RIF file for the specified {@link SyntheticDataFile}, then write out a
* fixed version of the file.
*
* @param hicnPool the {@link Deque} of HICNs to select from, which will be depleted as rows are
* written
* @param syntheticDataFile the beneficiary {@link SyntheticDataFile} to be fixed
* @throws IOException Any {@link IOException}s encountered will be bubbled up.
*/
private static void fixBeneficiaryFile(
    Deque<String> hicnPool, SyntheticDataFile syntheticDataFile) {
  LocalRifFile rifFile = syntheticDataFile.getRifFile();
  CSVParser parser = RifParsingUtils.createCsvParser(rifFile);
  LOGGER.info("Fixing RIF file: '{}'...", rifFile.getDisplayName());

  /*
   * We tell the CSVPrinter not to include a header here, because we will
   * manually add it later, based on what we find in the input file.
   */
  CSVFormat csvFormat = RifParsingUtils.CSV_FORMAT.withHeader((String[]) null);

  try (FileWriter writer = new FileWriter(syntheticDataFile.getFixedFilePath().toFile());
      CSVPrinter rifFilePrinter = new CSVPrinter(writer, csvFormat)) {
    /*
     * When we created the CSVPrinter, we told it to skip the header.
     * That ensures that we don't write out a header until we've started
     * reading the file and know what it is. Before proceeding, we
     * verify that the header is what we expect it to be, to avoid
     * propagating errors in our code.
     */
    Object[] columnNamesFromFile =
        parser.getHeaderMap().entrySet().stream()
            .sorted(Map.Entry.comparingByValue())
            .map(e -> e.getKey())
            .toArray();
    Object[] columnNamesFromFileWithoutMetadata =
        parser.getHeaderMap().entrySet().stream()
            .sorted(Map.Entry.comparingByValue())
            .map(e -> e.getKey())
            .filter(c -> !c.equals("DML_IND"))
            .toArray();
    Object[] columnNamesFromEnum =
        Arrays.stream(BeneficiaryColumn.values()).map(c -> c.name()).toArray();
    if (!Arrays.equals(columnNamesFromFileWithoutMetadata, columnNamesFromEnum))
      throw new IllegalStateException(
          String.format(
              "Column names mismatch:\nColumns from enum: %s\nColumns from file: %s",
              Arrays.toString(columnNamesFromEnum),
              Arrays.toString(columnNamesFromFileWithoutMetadata)));
    rifFilePrinter.printRecord(columnNamesFromFile);

    parser.forEach(
        r -> {
          // Read the record into a List.
          List<String> recordValues = new LinkedList<>();
          for (String value : r) recordValues.add(value);

          // Fix the not-random-enough HICNs.
          recordValues.set(BeneficiaryColumn.BENE_CRNT_HIC_NUM.ordinal() + 1, hicnPool.pop());

          // Fix the incorrectly formatted dates.
          fixDateFormatting(recordValues, BeneficiaryColumn.BENE_BIRTH_DT);

          try {
            rifFilePrinter.printRecord(recordValues);
          } catch (Exception e) {
            throw new IllegalStateException(e);
          }
        });
  } catch (IOException e) {
    throw new IllegalStateException(e);
  }
  LOGGER.info("Fixed RIF file: '{}'...", syntheticDataFile.getFixedFilePath());
}
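The fixDateFormatting helper is referenced above but not shown in this excerpt. A minimal sketch of what it might look like follows; the source and target date patterns are illustrative assumptions, and the +1 offset mirrors the leading DML_IND metadata column that precedes the enum-defined columns in each record:

// Hypothetical sketch of fixDateFormatting; requires java.time.LocalDate,
// java.time.format.DateTimeFormatter, and java.util.Locale.
private static void fixDateFormatting(List<String> recordValues, Enum<?> column) {
  int index = column.ordinal() + 1; // +1 skips the leading DML_IND column
  String value = recordValues.get(index);
  if (value == null || value.isEmpty()) return; // leave blank dates alone

  // Assumption: malformed values arrive as ISO dates ("1981-03-17") and the RIF
  // layout expects an upper-case "17-MAR-1981" style.
  LocalDate date = LocalDate.parse(value, DateTimeFormatter.ISO_LOCAL_DATE);
  String fixed =
      date.format(DateTimeFormatter.ofPattern("dd-MMM-yyyy", Locale.US)).toUpperCase(Locale.US);
  recordValues.set(index, fixed);
}

The same Enum-based signature would also cover the CarrierClaimColumn calls in fixCarrierFile below, since only the column's ordinal is needed.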
use of gov.cms.bfd.pipeline.ccw.rif.extract.LocalRifFile in project beneficiary-fhir-data by CMSgov.
the class SyntheticDataFixer method fixCarrierFile.
/**
* Process the original RIF file for the specified {@link SyntheticDataFile}, then write out a
* fixed version of the file.
*
* @param syntheticDataFile the beneficiary {@link SyntheticDataFile} to be fixed
* @param carrierClaimLineIds the {@link Set} of already-encountered <code>CLM_ID:LINE_NUM</code>
* pairs, which will be used to skip dupes
* @throws IOException Any {@link IOException}s encountered will be bubbled up.
*/
private static void fixCarrierFile(
    SyntheticDataFile syntheticDataFile, Set<String> carrierClaimLineIds) {
  LocalRifFile rifFile = syntheticDataFile.getRifFile();
  CSVParser parser = RifParsingUtils.createCsvParser(rifFile);
  LOGGER.info("Fixing RIF file: '{}'...", rifFile.getDisplayName());

  /*
   * We tell the CSVPrinter not to include a header here, because we will
   * manually add it later, based on what we find in the input file.
   */
  CSVFormat csvFormat = RifParsingUtils.CSV_FORMAT.withHeader((String[]) null);

  try (FileWriter writer = new FileWriter(syntheticDataFile.getFixedFilePath().toFile());
      CSVPrinter rifFilePrinter = new CSVPrinter(writer, csvFormat)) {
    /*
     * When we created the CSVPrinter, we told it to skip the header.
     * That ensures that we don't write out a header until we've started
     * reading the file and know what it is. Before proceeding, we
     * verify that the header is what we expect it to be, to avoid
     * propagating errors in our code.
     */
    Object[] columnNamesFromFile =
        parser.getHeaderMap().entrySet().stream()
            .sorted(Map.Entry.comparingByValue())
            .map(e -> e.getKey())
            .toArray();
    Object[] columnNamesFromFileWithoutMetadata =
        parser.getHeaderMap().entrySet().stream()
            .sorted(Map.Entry.comparingByValue())
            .map(e -> e.getKey())
            .filter(c -> !c.equals("DML_IND"))
            .toArray();
    Object[] columnNamesFromEnum =
        Arrays.stream(CarrierClaimColumn.values()).map(c -> c.name()).toArray();
    if (!Arrays.equals(columnNamesFromFileWithoutMetadata, columnNamesFromEnum))
      throw new IllegalStateException(
          String.format(
              "Column names mismatch:\nColumns from enum: %s\nColumns from file: %s",
              Arrays.toString(columnNamesFromEnum),
              Arrays.toString(columnNamesFromFileWithoutMetadata)));
    rifFilePrinter.printRecord(columnNamesFromFile);

    parser.forEach(
        r -> {
          // Read the record into a List.
          List<String> recordValues = new LinkedList<>();
          for (String value : r) recordValues.add(value);

          // Skip dupe PKs.
          String carrierClaimLineId =
              String.format(
                  "%s:%s",
                  recordValues.get(CarrierClaimColumn.CLM_ID.ordinal() + 1),
                  recordValues.get(CarrierClaimColumn.LINE_NUM.ordinal() + 1));
          if (carrierClaimLineIds.contains(carrierClaimLineId)) return;
          carrierClaimLineIds.add(carrierClaimLineId);

          // Fix the incorrectly formatted dates.
          fixDateFormatting(recordValues, CarrierClaimColumn.CLM_FROM_DT);
          fixDateFormatting(recordValues, CarrierClaimColumn.CLM_THRU_DT);
          fixDateFormatting(recordValues, CarrierClaimColumn.NCH_WKLY_PROC_DT);
          fixDateFormatting(recordValues, CarrierClaimColumn.LINE_1ST_EXPNS_DT);
          fixDateFormatting(recordValues, CarrierClaimColumn.LINE_LAST_EXPNS_DT);

          // Fix the incorrectly formatted numbers.
          fixNumberFormatting(recordValues, CarrierClaimColumn.CLM_PMT_AMT);
          fixNumberFormatting(recordValues, CarrierClaimColumn.CARR_CLM_PRMRY_PYR_PD_AMT);
          fixNumberFormatting(recordValues, CarrierClaimColumn.NCH_CLM_PRVDR_PMT_AMT);
          fixNumberFormatting(recordValues, CarrierClaimColumn.NCH_CLM_BENE_PMT_AMT);
          fixNumberFormatting(recordValues, CarrierClaimColumn.NCH_CARR_CLM_SBMTD_CHRG_AMT);
          fixNumberFormatting(recordValues, CarrierClaimColumn.NCH_CARR_CLM_ALOWD_AMT);
          fixNumberFormatting(recordValues, CarrierClaimColumn.CARR_CLM_CASH_DDCTBL_APLD_AMT);
          fixNumberFormatting(recordValues, CarrierClaimColumn.LINE_SRVC_CNT);
          fixNumberFormatting(recordValues, CarrierClaimColumn.LINE_NCH_PMT_AMT);
          fixNumberFormatting(recordValues, CarrierClaimColumn.LINE_BENE_PMT_AMT);
          fixNumberFormatting(recordValues, CarrierClaimColumn.LINE_PRVDR_PMT_AMT);
          fixNumberFormatting(recordValues, CarrierClaimColumn.LINE_BENE_PTB_DDCTBL_AMT);
          fixNumberFormatting(recordValues, CarrierClaimColumn.LINE_BENE_PRMRY_PYR_PD_AMT);
          fixNumberFormatting(recordValues, CarrierClaimColumn.LINE_COINSRNC_AMT);
          fixNumberFormatting(recordValues, CarrierClaimColumn.LINE_SBMTD_CHRG_AMT);
          fixNumberFormatting(recordValues, CarrierClaimColumn.LINE_ALOWD_CHRG_AMT);
          fixNumberFormatting(recordValues, CarrierClaimColumn.CARR_LINE_MTUS_CNT);
          fixNumberFormatting(recordValues, CarrierClaimColumn.LINE_HCT_HGB_RSLT_NUM);
          fixNumberFormatting(recordValues, CarrierClaimColumn.CARR_LINE_ANSTHSA_UNIT_CNT);

          // Fix the unexpected null values.
          replaceNullValue(recordValues, CarrierClaimColumn.CARR_LINE_PRVDR_TYPE_CD, "0");

          try {
            rifFilePrinter.printRecord(recordValues);
          } catch (Exception e) {
            throw new IllegalStateException(e);
          }
        });
  } catch (IOException e) {
    throw new IllegalStateException(e);
  }
  LOGGER.info("Fixed RIF file: '{}'...", syntheticDataFile.getFixedFilePath());
}
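fixNumberFormatting and replaceNullValue are likewise referenced but not defined in this excerpt. The sketches below show one plausible shape for them; the exact normalization rules are assumptions:

// Hypothetical sketch: normalize a numeric column to a plain decimal string
// (e.g. "1,234.50" -> "1234.50"). Requires java.math.BigDecimal.
private static void fixNumberFormatting(List<String> recordValues, Enum<?> column) {
  int index = column.ordinal() + 1; // +1 skips the leading DML_IND column
  String value = recordValues.get(index);
  if (value == null || value.trim().isEmpty()) return;
  recordValues.set(index, new BigDecimal(value.replace(",", "").trim()).toPlainString());
}

// Hypothetical sketch: substitute a default for columns that should never be empty.
private static void replaceNullValue(List<String> recordValues, Enum<?> column, String defaultValue) {
  int index = column.ordinal() + 1;
  String value = recordValues.get(index);
  if (value == null || value.trim().isEmpty()) recordValues.set(index, defaultValue);
}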
use of gov.cms.bfd.pipeline.ccw.rif.extract.LocalRifFile in project beneficiary-fhir-data by CMSgov.
the class SyntheticDataFixer2 method fixPartDEventsFile.
/**
* Process the original RIF file for the specified {@link SyntheticDataFile}, then write out a
* fixed version of the file.
*
* @param syntheticDataFile the beneficiary {@link SyntheticDataFile} to be fixed
* @throws IOException Any {@link IOException}s encountered will be bubbled up.
*/
private static void fixPartDEventsFile(SyntheticDataFile syntheticDataFile) {
  LocalRifFile rifFile = syntheticDataFile.getRifFile();
  CSVParser parser = RifParsingUtils.createCsvParser(rifFile);
  LOGGER.info("Fixing RIF file: '{}'...", rifFile.getDisplayName());

  /*
   * We tell the CSVPrinter not to include a header here, because we will manually
   * add it later, based on what we find in the input file.
   */
  CSVFormat csvFormat = RifParsingUtils.CSV_FORMAT.withHeader((String[]) null);

  try (FileWriter writer = new FileWriter(syntheticDataFile.getFixedFilePath().toFile());
      CSVPrinter rifFilePrinter = new CSVPrinter(writer, csvFormat)) {
    /*
     * When we created the CSVPrinter, we told it to skip the header. That ensures
     * that we don't write out a header until we've started reading the file and
     * know what it is. Before proceeding, we verify that the header is what we
     * expect it to be, to avoid propagating errors in our code.
     */
    Object[] columnNamesFromFile =
        parser.getHeaderMap().entrySet().stream()
            .sorted(Map.Entry.comparingByValue())
            .map(e -> e.getKey())
            .toArray();
    Object[] columnNamesFromFileWithoutMetadata =
        parser.getHeaderMap().entrySet().stream()
            .sorted(Map.Entry.comparingByValue())
            .map(e -> e.getKey())
            .filter(c -> !c.equals("DML_IND"))
            .toArray();
    Object[] columnNamesFromEnum =
        Arrays.stream(PartDEventColumn.values()).map(c -> c.name()).toArray();
    if (!Arrays.equals(columnNamesFromFileWithoutMetadata, columnNamesFromEnum))
      throw new IllegalStateException(
          String.format(
              "Column names mismatch:\nColumns from enum: %s\nColumns from file: %s",
              Arrays.toString(columnNamesFromEnum),
              Arrays.toString(columnNamesFromFileWithoutMetadata)));
    rifFilePrinter.printRecord(columnNamesFromFile);

    parser.forEach(
        r -> {
          // Read the record into a List.
          List<String> recordValues = new LinkedList<>();
          for (String value : r) recordValues.add(value);

          // Make the PDE_ID negative.
          makeColumnNegative(recordValues, PartDEventColumn.PDE_ID);

          // Make the BENE_ID negative.
          makeColumnNegative(recordValues, PartDEventColumn.BENE_ID);

          try {
            rifFilePrinter.printRecord(recordValues);
          } catch (Exception e) {
            throw new IllegalStateException(e);
          }
        });
  } catch (IOException e) {
    throw new IllegalStateException(e);
  }
  LOGGER.info("Fixed RIF file: '{}'...", syntheticDataFile.getFixedFilePath());
}
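makeColumnNegative is not shown in this excerpt either. A sketch of the likely behavior (prefixing the existing numeric ID with a minus sign, presumably so synthetic IDs cannot collide with real ones) is below; the implementation details are assumptions:

// Hypothetical sketch: make an ID column negative by prefixing a minus sign.
private static void makeColumnNegative(List<String> recordValues, Enum<?> column) {
  int index = column.ordinal() + 1; // +1 skips the leading DML_IND column
  String value = recordValues.get(index);
  if (value.startsWith("-")) return; // already negative; nothing to do
  recordValues.set(index, "-" + value);
}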
use of gov.cms.bfd.pipeline.ccw.rif.extract.LocalRifFile in project beneficiary-fhir-data by CMSgov.
the class SyntheticDataFixer2 method fixBeneficiaryFile.
/**
* Process the original RIF file for the specified {@link SyntheticDataFile}, then write out a
* fixed version of the file.
*
* @param syntheticDataFile the beneficiary {@link SyntheticDataFile} to be fixed
* @throws IOException Any {@link IOException}s encountered will be bubbled up.
*/
private static void fixBeneficiaryFile(SyntheticDataFile syntheticDataFile) {
  LocalRifFile rifFile = syntheticDataFile.getRifFile();
  CSVParser parser = RifParsingUtils.createCsvParser(rifFile);
  LOGGER.info("Fixing RIF file: '{}'...", rifFile.getDisplayName());

  /*
   * We tell the CSVPrinter not to include a header here, because we will manually
   * add it later, based on what we find in the input file.
   */
  CSVFormat csvFormat = RifParsingUtils.CSV_FORMAT.withHeader((String[]) null);

  try (FileWriter writer = new FileWriter(syntheticDataFile.getFixedFilePath().toFile());
      CSVPrinter rifFilePrinter = new CSVPrinter(writer, csvFormat)) {
    /*
     * When we created the CSVPrinter, we told it to skip the header. That ensures
     * that we don't write out a header until we've started reading the file and
     * know what it is. Before proceeding, we verify that the header is what we
     * expect it to be, to avoid propagating errors in our code.
     */
    Object[] columnNamesFromFile =
        parser.getHeaderMap().entrySet().stream()
            .sorted(Map.Entry.comparingByValue())
            .map(e -> e.getKey())
            .toArray();
    Object[] columnNamesFromFileWithoutMetadata =
        parser.getHeaderMap().entrySet().stream()
            .sorted(Map.Entry.comparingByValue())
            .map(e -> e.getKey())
            .filter(c -> !c.equals("DML_IND"))
            .toArray();
    Object[] columnNamesFromEnum =
        Arrays.stream(BeneficiaryColumn.values()).map(c -> c.name()).toArray();
    if (!Arrays.equals(columnNamesFromFileWithoutMetadata, columnNamesFromEnum))
      throw new IllegalStateException(
          String.format(
              "Column names mismatch:\nColumns from enum: %s\nColumns from file: %s",
              Arrays.toString(columnNamesFromEnum),
              Arrays.toString(columnNamesFromFileWithoutMetadata)));
    rifFilePrinter.printRecord(columnNamesFromFile);

    parser.forEach(
        r -> {
          // Read the record into a List.
          List<String> recordValues = new LinkedList<>();
          for (String value : r) recordValues.add(value);

          // Make the BENE_ID negative.
          makeColumnNegative(recordValues, BeneficiaryColumn.BENE_ID);

          try {
            rifFilePrinter.printRecord(recordValues);
          } catch (Exception e) {
            throw new IllegalStateException(e);
          }
        });
  } catch (IOException e) {
    throw new IllegalStateException(e);
  }
  LOGGER.info("Fixed RIF file: '{}'...", syntheticDataFile.getFixedFilePath());
}
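Since each fixer writes the original header (including DML_IND) back out, one way to sanity-check a fixed file is to re-open it with LocalRifFile and compare its header against the column enum. This is a sketch that reuses only the APIs shown above; RifFileType.BENEFICIARY is an assumed file-type constant, and the check is not part of SyntheticDataFixer2:

// Minimal sketch: re-parse the fixed beneficiary file and confirm its non-metadata
// columns still line up with BeneficiaryColumn.
LocalRifFile fixedFile =
    new LocalRifFile(syntheticDataFile.getFixedFilePath(), RifFileType.BENEFICIARY);
try (CSVParser checkParser = RifParsingUtils.createCsvParser(fixedFile)) {
  Object[] fixedColumns =
      checkParser.getHeaderMap().entrySet().stream()
          .sorted(Map.Entry.comparingByValue())
          .map(Map.Entry::getKey)
          .filter(c -> !c.equals("DML_IND"))
          .toArray();
  Object[] expectedColumns = Arrays.stream(BeneficiaryColumn.values()).map(Enum::name).toArray();
  if (!Arrays.equals(fixedColumns, expectedColumns))
    throw new IllegalStateException("Fixed beneficiary file header does not match BeneficiaryColumn.");
} catch (IOException e) {
  throw new IllegalStateException(e);
}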