use of gov.cms.bfd.model.codebook.model.Value in project beneficiary-fhir-data by CMSgov.
the class PdfParserTest method parseCodebookPdf_CARR_LINE_PRVDR_TYPE_CD.
/**
 * Tests {@link
 * gov.cms.bfd.model.codebook.extractor.PdfParser#parseCodebookPdf(SupportedCodebook)} against
 * {@link gov.cms.bfd.model.codebook.extractor.SupportedCodebook#FFS_CLAIMS} for the <code>
 * CARR_LINE_PRVDR_TYPE_CD</code> variable.
 *
 * @throws IOException Indicates test error.
 */
@Test
public void parseCodebookPdf_CARR_LINE_PRVDR_TYPE_CD() throws IOException {
  /*
   * Why are we spot checking this one variable's parsed output? Because it's
   * somewhat interesting: 1) it has multiple valueGroups, 2) many of its values
   * have multiple lines, 3) many of its coded values are duplicated, 4) it has a
   * COMMENT that's just "-".
   */
  Codebook codebook = PdfParser.parseCodebookPdf(SupportedCodebook.FFS_CLAIMS);

  // Fail with a descriptive message (rather than a bare NoSuchElementException) if the
  // variable under test is missing from the parsed codebook.
  Variable variable =
      codebook.getVariables().stream()
          .filter(v -> v.getId().equals("CARR_LINE_PRVDR_TYPE_CD"))
          .findAny()
          .orElseThrow(
              () -> new AssertionError("Variable not found in codebook: CARR_LINE_PRVDR_TYPE_CD"));

  // Simple scalar fields.
  String expectedDescription1 =
      "Code identifying the type of provider furnishing the service for this line"
          + " item on the carrier claim.";
  assertEquals("Carrier Line Provider Type Code", variable.getLabel());
  assertParagraphsEquals(Arrays.asList(expectedDescription1), variable.getDescription());
  assertEquals("PRV_TYPE", variable.getShortName().get());
  assertEquals("CARR_LINE_PRVDR_TYPE_CD", variable.getLongName());
  assertEquals(VariableType.CHAR, variable.getType().get());
  // Integer.valueOf(...) replaces the deprecated Integer(int) boxing constructor.
  assertEquals(Integer.valueOf(1), variable.getLength());
  assertEquals("NCH", variable.getSource().get());

  // This variable is expected to have coded values rather than a free-form value format.
  assertFalse(variable.getValueFormat().isPresent());
  assertEquals(2, variable.getValueGroups().get().size());
  assertEquals(8, variable.getValueGroups().get().get(0).getValues().size());
  assertParagraphsEquals(
      Arrays.asList("For Physician/Supplier Claims:"),
      variable.getValueGroups().get().get(0).getDescription());
  assertEquals(9, variable.getValueGroups().get().get(1).getValues().size());
  assertParagraphsEquals(
      Arrays.asList(
          "NOTE: PRIOR TO VERSION H, DME claims also used this code; the"
              + " following were valid code VALUES:"),
      variable.getValueGroups().get().get(1).getDescription());

  // Spot-check some of the values:
  Value value_0_3 = variable.getValueGroups().get().get(0).getValues().get(3);
  assertEquals("3", value_0_3.getCode());
  assertEquals("Institutional provider", value_0_3.getDescription());
  Value value_1_8 = variable.getValueGroups().get().get(1).getValues().get(8);
  assertEquals("8", value_1_8.getCode());
  assertEquals(
      "Other entities for whom EI numbers are used in coding the ID field or proprietorship"
          + " for whom EI numbers are used in coding the ID field.",
      value_1_8.getDescription());

  // The "-" COMMENT mentioned above is expected to be parsed as "no comment".
  assertFalse(variable.getComment().isPresent());
}
use of gov.cms.bfd.model.codebook.model.Value in project beneficiary-fhir-data by CMSgov.
the class PdfParserTest method findVariableSections.
/**
 * Runs {@link gov.cms.bfd.model.codebook.extractor.PdfParser#findVariableSections(List)} over
 * the text of every {@link gov.cms.bfd.model.codebook.extractor.SupportedCodebook} and checks
 * that the resulting sections look complete.
 *
 * @throws IOException Indicates test error.
 */
@Test
public void findVariableSections() throws IOException {
  for (SupportedCodebook codebookToCheck : SupportedCodebook.values()) {
    try (InputStream pdfStream = codebookToCheck.getCodebookPdfInputStream()) {
      LOGGER.info("Looking for sections in codebook: {}", codebookToCheck.name());

      /*
       * Note: The printXXX(...) debugging calls stay disabled unless/until a
       * specific problem needs them, because they produce a ton of log noise.
       */
      List<String> allLines = PdfParser.extractTextLinesFromPdf(pdfStream);
      // printTextLinesToConsole(allLines);

      // Every section must exist and contain at least ten lines.
      List<List<String>> sections = PdfParser.findVariableSections(allLines);
      sections.forEach(
          section -> {
            assertNotNull(section);
            assertTrue(section.size() >= 10);
          });

      /*
       * Extra check on the section splitting: choose a single-line field whose
       * value should be unique per section, gather every occurrence of it from the
       * un-grouped lines, and then require that each occurrence shows up in some
       * section.
       */
      List<String> shortNameLines =
          allLines.stream()
              .filter(candidate -> candidate.startsWith("SHORT_NAME:"))
              .collect(Collectors.toList());

      // A failure here means a different search field has to be chosen.
      assertEquals(
          shortNameLines.size(),
          new HashSet<>(shortNameLines).size(),
          "Not all instances of that field are unique.");

      for (String shortNameLine : shortNameLines) {
        boolean presentInSomeSection =
            sections.stream().anyMatch(section -> section.contains(shortNameLine));
        assertTrue(
            presentInSomeSection,
            String.format("Can't find search field line: '%s'", shortNameLine));
      }
    }
  }
}
use of gov.cms.bfd.model.codebook.model.Value in project beneficiary-fhir-data by CMSgov.
the class SupportedCodebookTest method findDuplicateCodes.
/**
 * Runs {@link gov.cms.bfd.model.codebook.extractor.PdfParser} over every {@link
 * gov.cms.bfd.model.codebook.extractor.SupportedCodebook} and logs a warning for each {@link
 * Value#getCode()} that occurs more than once inside a single {@link Variable}. Duplicates are
 * only reported; they do not fail the test.
 *
 * @throws IOException Indicates test error.
 */
@Test
public void findDuplicateCodes() throws IOException {
  for (SupportedCodebook supportedCodebook : SupportedCodebook.values()) {
    Codebook parsedCodebook = PdfParser.parseCodebookPdf(supportedCodebook);

    for (Variable variable : parsedCodebook.getVariables()) {
      // Only variables that carry coded values are of interest.
      if (variable.getValueGroups().isPresent()) {
        // Group every Value under its code, keeping first-seen order of the codes.
        Map<String, List<Value>> codeToValues = new LinkedHashMap<>();
        for (ValueGroup group : variable.getValueGroups().get()) {
          for (Value candidate : group.getValues()) {
            codeToValues
                .computeIfAbsent(candidate.getCode(), unused -> new ArrayList<>())
                .add(candidate);
          }
        }

        // Any code mapped to more than one Value is a duplicate: log it in detail.
        for (Map.Entry<String, List<Value>> codeEntry : codeToValues.entrySet()) {
          if (codeEntry.getValue().size() > 1) {
            LOGGER.warn(
                "The code '{}' appears more than once in Variable '{}': {}.",
                codeEntry.getKey(),
                variable,
                codeEntry.getValue());
          }
        }
      }
    }
  }
}
use of gov.cms.bfd.model.codebook.model.Value in project beneficiary-fhir-data by CMSgov.
the class PdfParser method parseValueGroups.
/**
 * Extracts the coded values of a {@link Variable}, organized into {@link ValueGroup}s, from the
 * variable's raw text section.
 *
 * @param variableSection the variable section to parse the value from
 * @return the {@link Variable#getValueGroups()} value from the specified {@link Variable} raw
 *     text section, or <code>null</code> if it was not present
 */
private static List<ValueGroup> parseValueGroups(List<String> variableSection) {
  /*
   * Strategy: a Variable carries EITHER a valueFormat OR valueGroups. Seeing at
   * least one "XX = YY" line means coded values (valueGroups); seeing none means
   * the field is a valueFormat, which is not this method's job.
   */
  String variableId = parseId(variableSection);
  List<String> fieldLines =
      extractFieldContent(variableSection, FIELD_NAME_VALUES, FIELD_NAME_VALUES_ALT1);
  if (fieldLines == null) {
    throw new IllegalStateException(
        String.format(
            "Invalid '%s' field in variable section: %s", FIELD_NAME_VALUES, variableSection));
  }

  // Is there any "XX = YY" coded value line?
  // NOTE(review): this looks at every line of the variable section, not only the
  // lines of the values field, so a matching line in some other field also
  // counts -- confirm that this is intended.
  boolean hasCodedValue = false;
  for (String sectionLine : variableSection) {
    hasCodedValue |= PATTERN_VALUE_LINE_WITH_CODE.matcher(sectionLine).matches();
  }
  if (!hasCodedValue) {
    return null;
  }

  /*
   * From here on we are parsing coded values. The loop below tracks three pieces
   * of in-progress state: the group being filled, the description lines gathered
   * for it so far, and the text lines of the value currently being read.
   */
  List<ValueGroup> parsedGroups = new ArrayList<>();
  ValueGroup openGroup = new ValueGroup();
  List<String> pendingDescriptionLines = new ArrayList<>();
  List<String> pendingValueLines = new ArrayList<>();

  for (int lineIndex = 0; lineIndex < fieldLines.size(); lineIndex++) {
    String fieldLine = fieldLines.get(lineIndex);

    if (isValueGroupDescription(variableId, fieldLines, lineIndex)) {
      if (!pendingValueLines.isEmpty()) {
        // A description line after value lines closes both that value and its group.
        openGroup.getValues().add(parseValue(pendingValueLines));
        pendingValueLines = new ArrayList<>();
        parsedGroups.add(openGroup);
        openGroup = new ValueGroup();
      }
      // The description line itself is always kept.
      pendingDescriptionLines.add(fieldLine);
      continue;
    }

    // Otherwise this line belongs to a value.
    boolean startsNewValue = PATTERN_VALUE_LINE_WITH_CODE.matcher(fieldLine).matches();

    if (!pendingDescriptionLines.isEmpty()) {
      // The group's description just finished: attach it to the open group.
      openGroup.setDescription(extractParagraphs(pendingDescriptionLines));
      pendingDescriptionLines = new ArrayList<>();
    }

    if (startsNewValue && !pendingValueLines.isEmpty()) {
      // A new "XX = YY" line closes the value that was being read.
      openGroup.getValues().add(parseValue(pendingValueLines));
      pendingValueLines = new ArrayList<>();
    }

    // The value line itself is always kept.
    pendingValueLines.add(fieldLine);
  }

  if (!pendingValueLines.isEmpty()) {
    // Out of lines: close the last value and the group that holds it.
    openGroup.getValues().add(parseValue(pendingValueLines));
    parsedGroups.add(openGroup);
  }

  // Sanity check: description lines must never be left dangling at the end.
  if (!pendingDescriptionLines.isEmpty()) {
    throw new BadCodeMonkeyException();
  }

  return parsedGroups;
}
use of gov.cms.bfd.model.codebook.model.Value in project beneficiary-fhir-data by CMSgov.
the class PdfParser method parseValue.
/**
 * Parses a single coded {@link Value} from its raw text lines. The first line must match
 * <code>PATTERN_VALUE_LINE_WITH_CODE</code>: its first capture group is used as the code and its
 * second capture group as the start of the description. Any further lines are continuation lines
 * of the description.
 *
 * @param valueLines the lines of text representing a {@link Value} to be parsed
 * @return the {@link Value} parsed from those lines
 * @throws IllegalArgumentException if <code>valueLines</code> is empty
 * @throws IllegalStateException if the first line does not match the coded value pattern
 */
private static Value parseValue(List<String> valueLines) {
  if (valueLines.isEmpty()) {
    throw new IllegalArgumentException("Can't parse a value from an empty list of lines.");
  }

  // Copy the list so we can bang on it safely.
  List<String> valueLinesCopy = new ArrayList<>(valueLines);

  // Parse the first line, failing loudly (with the offending lines) if it has no code.
  Matcher valueStartMatcher = PATTERN_VALUE_LINE_WITH_CODE.matcher(valueLinesCopy.get(0));
  if (!valueStartMatcher.matches()) {
    throw new IllegalStateException(
        String.format("Value does not start with a coded value line: %s", valueLines));
  }

  // Grab the code from the first line.
  String code = valueStartMatcher.group(1);

  // Strip out the "XX = " prefix from the first line.
  valueLinesCopy.set(0, valueStartMatcher.group(2));

  // Convert it all to "paragraphs" to undo the line wrapping.
  List<String> paragraphs = extractParagraphs(valueLinesCopy);

  // Just in case multiple paragraphs were found, glue them back together with spaces.
  Value value = new Value();
  value.setCode(code);
  value.setDescription(String.join(" ", paragraphs));
  return value;
}
Aggregations