Examples with VariableCategory - edu.harvard.iq.dataverse.datavariable.VariableCategory

Example 6 with VariableCategory

use of edu.harvard.iq.dataverse.datavariable.VariableCategory in project dataverse by IQSS.

the class SAVFileReader method decodeRecordType2.

/**
 * Decodes the Record Type 2 (variable description) section of a SAV file.
 *
 * <p>One RT2 record is read per OBS (8-byte) unit, up to {@code OBSUnitsPerCase}
 * records. Each record consists of a fixed {@code LENGTH_RECORDTYPE2_FIXED}-byte
 * segment, optionally followed by a variable label (a 4-byte length plus the
 * label bytes) and by missing-value units. While decoding, the method fills the
 * reader's per-OBS and per-variable collections ({@code OBSwiseTypelList},
 * {@code variableTypelList}, {@code variableNameList},
 * {@code OBSIndexToVariableName}, {@code obsNonVariableBlockSet},
 * {@code formatDecimalPointPositionList}, {@code printFormatTable},
 * {@code invalidDataTable}) and finally creates one {@code DataVariable} per
 * decoded variable and attaches the list to {@code dataTable}.
 *
 * @param stream the input stream positioned at the first RT2 record
 * @throws IllegalArgumentException if {@code stream} is {@code null}
 * @throws IOException if the stream ends before a requested segment has been
 *         read completely, if the label flag is neither 0 nor 1, if an unknown
 *         print format code is found, or if the RT2 section ends before
 *         {@code OBSUnitsPerCase} records have been processed
 */
void decodeRecordType2(BufferedInputStream stream) throws IOException {
    dbgLog.fine("decodeRecordType2(): start");
    if (stream == null) {
        throw new IllegalArgumentException("stream == null!");
    }
    Map<String, String> printFormatNameTable = new LinkedHashMap<String, String>();
    Map<String, String> variableLabelMap = new LinkedHashMap<String, String>();
    Map<String, List<String>> missingValueTable = new LinkedHashMap<String, List<String>>();
    List<Integer> printFormatList = new ArrayList<Integer>();
    String caseWeightVariableName = null;
    int caseWeightVariableIndex = 0;
    boolean lastVariableIsExtendable = false;
    boolean extendedVariableMode = false;
    boolean obs255 = false;
    String lastVariableName = null;
    String lastExtendedVariable = null;
    // this field repeats as many as the number of variables in
    // this sav file
    // (note that the above statement is not technically correct, this
    // record repeats not just for every variable in the file, but for
    // every OBS (8 byte unit); i.e., if a string is split into multiple
    // OBS units, each one will have its own RT2 record -- L.A.).
    // Each field consists of a fixed (32-byte) segment and
    // then a few variable segments:
    // if the variable has a label (3rd INT4 set to 1), then there's 4 more
    // bytes specifying the length of the label, and then that many bytes
    // holding the label itself (no more than 256).
    // Then if there are optional missing value units (4th INT4 set to 1)
    // there will be 3 more OBS units attached = 24 extra bytes.
    int variableCounter = 0;
    int obsSeqNumber = 0;
    int j;
    dbgLog.fine("RT2: Reading " + OBSUnitsPerCase + " OBS units.");
    for (j = 0; j < OBSUnitsPerCase; j++) {
        dbgLog.fine("RT2: " + j + "-th RT2 unit is being decoded.");
        // 2.0: read the fixed[=non-optional] 32-byte segment
        byte[] recordType2Fixed = new byte[LENGTH_RECORDTYPE2_FIXED];
        try {
            // InputStream.read() signals end-of-stream with -1 (never 0 for a
            // non-empty buffer) and may return a short count, so the segment
            // is read in full or the decoding fails with an IOException.
            readRT2Bytes(stream, recordType2Fixed, "reading recordType2");
            int offset = 0;
            // 2.1: decode the first 16-byte segment as four 4-byte integers
            int rt2_1st_4_units = 4;
            int[] recordType2FixedPart1 = new int[rt2_1st_4_units];
            for (int i = 0; i < rt2_1st_4_units; i++) {
                ByteBuffer intView = ByteBuffer.wrap(recordType2Fixed, offset, LENGTH_SAV_INT_BLOCK);
                offset += LENGTH_SAV_INT_BLOCK;
                if (isLittleEndian) {
                    intView.order(ByteOrder.LITTLE_ENDIAN);
                }
                recordType2FixedPart1[i] = intView.getInt();
            }
            // 1st ([0]) element must be 2 otherwise no longer Record Type 2
            if (recordType2FixedPart1[0] != 2) {
                dbgLog.warning(j + "-th RT header value is no longet RT2! " + recordType2FixedPart1[0]);
                break;
            }
            dbgLog.fine("variable type[must be 2]=" + recordType2FixedPart1[0]);
            // 2.3 variable name: 8 byte(space[x20]-padded)
            // This field is located at the very end of the 32 byte
            // fixed-size RT2 header (bytes 24-31).
            // We are processing it now, so that
            // we can make the decision on whether this variable is part
            // of a compound variable:
            String RawVariableName = getNullStrippedString(new String(Arrays.copyOfRange(recordType2Fixed, 24, (24 + LENGTH_VARIABLE_NAME)), defaultCharSet));
            // The name ends at the first padding space, if there is one.
            String variableName = null;
            if (RawVariableName.indexOf(' ') >= 0) {
                variableName = RawVariableName.substring(0, RawVariableName.indexOf(' '));
            } else {
                variableName = RawVariableName;
            }
            // 2nd ([1]) element: numeric variable = 0 :for string variable
            // this block indicates its datum-length, i.e, >0 ;
            // if -1, this RT2 unit is a non-1st RT2 unit for a string variable
            // whose value is longer than 8 character.
            boolean isNumericVariable = false;
            dbgLog.fine("variable type(0: numeric; > 0: String;-1 continue )=" + recordType2FixedPart1[1]);
            int HowManyRt2Units = 1;
            if (recordType2FixedPart1[1] == -1) {
                dbgLog.fine("this RT2 is an 8 bit continuation chunk of an earlier string variable");
                if (obs255) {
                    if (obsSeqNumber < 30) {
                        OBSwiseTypelList.add(recordType2FixedPart1[1]);
                        obsSeqNumber++;
                    } else {
                        OBSwiseTypelList.add(-2);
                        obs255 = false;
                        obsSeqNumber = 0;
                    }
                } else {
                    OBSwiseTypelList.add(recordType2FixedPart1[1]);
                }
                obsNonVariableBlockSet.add(j);
                continue;
            } else if (recordType2FixedPart1[1] == 0) {
                // This is a numeric variable
                extendedVariableMode = false;
                // And as such, it cannot be an extension of a
                // previous, long string variable.
                OBSwiseTypelList.add(recordType2FixedPart1[1]);
                variableCounter++;
                isNumericVariable = true;
                variableTypelList.add(recordType2FixedPart1[1]);
            } else if (recordType2FixedPart1[1] > 0) {
                if (recordType2FixedPart1[1] == 255) {
                    obs255 = true;
                }
                if (lastVariableIsExtendable) {
                    String varNameBase = null;
                    if (lastVariableName.length() > 5) {
                        varNameBase = lastVariableName.substring(0, 5);
                    } else {
                        varNameBase = lastVariableName;
                    }
                    if (extendedVariableMode) {
                        if (variableNameIsAnIncrement(varNameBase, lastExtendedVariable, variableName)) {
                            OBSwiseTypelList.add(-1);
                            lastExtendedVariable = variableName;
                        // OK, we stay in the "extended variable" mode;
                        // but we can't move on to the next OBS yet (no
                        // "continue" here): the label check below must
                        // still run for this record.
                        //
                        // Should we also set "extendable" flag to false at this point
                        // if it's shorter than 255 bytes, i.e. the last extended chunk?
                        } else {
                            extendedVariableMode = false;
                        }
                    } else {
                        if (variableNameIsAnIncrement(varNameBase, variableName)) {
                            OBSwiseTypelList.add(-1);
                            extendedVariableMode = true;
                            dbgLog.fine("RT2: in extended variable mode; variable " + variableName);
                            lastExtendedVariable = variableName;
                        // Before we move on to the next OBS unit, we need to check
                        // if this current extended variable has its own label specified;
                        // If so, we need to determine its length, then read and skip
                        // that many bytes. Hence no "continue" here.
                        }
                    }
                }
                if (!extendedVariableMode) {
                    // OK, this is a "real"
                    // string variable, and not a continuation chunk of a compound
                    // string.
                    OBSwiseTypelList.add(recordType2FixedPart1[1]);
                    variableCounter++;
                    // A 255 byte variable is either the single "atomic"
                    // variable of the max allowed size, or a 255 byte segment
                    // of a compound variable. So the next variable will be
                    // checked to see if it is the continuation of this one.
                    lastVariableIsExtendable = (recordType2FixedPart1[1] == 255);
                    if (recordType2FixedPart1[1] % LENGTH_SAV_OBS_BLOCK == 0) {
                        HowManyRt2Units = recordType2FixedPart1[1] / LENGTH_SAV_OBS_BLOCK;
                    } else {
                        HowManyRt2Units = recordType2FixedPart1[1] / LENGTH_SAV_OBS_BLOCK + 1;
                    }
                    variableTypelList.add(recordType2FixedPart1[1]);
                }
            }
            if (!extendedVariableMode) {
                // Again, we only want to do the following steps for the "real"
                // variables, not the chunks of split mega-variables:
                dbgLog.fine("RT2: HowManyRt2Units for this variable=" + HowManyRt2Units);
                lastVariableName = variableName;
                // caseWeightVariableOBSIndex starts from 1: 0 is used for does-not-exist cases
                if (j == (caseWeightVariableOBSIndex - 1)) {
                    caseWeightVariableName = variableName;
                    // TODO: do we need this "index"? -- 4.0 alpha
                    caseWeightVariableIndex = variableCounter;
                // /smd.setCaseWeightVariableName(caseWeightVariableName);
                // /smd.getFileInformation().put("caseWeightVariableIndex", caseWeightVariableIndex);
                }
                OBSIndexToVariableName.put(j, variableName);
                dbgLog.fine("RT2: " + j + "-th variable name=" + variableName + "<-");
                dbgLog.fine("RT2: raw variable: " + RawVariableName);
                variableNameList.add(variableName);
            }
            // 3rd ([2]) element: = 1 variable-label block follows; 0 = no label
            dbgLog.fine("RT: variable label follows?(1:yes; 0: no)=" + recordType2FixedPart1[2]);
            boolean hasVariableLabel = (recordType2FixedPart1[2] == 1);
            if ((recordType2FixedPart1[2] != 0) && (recordType2FixedPart1[2] != 1)) {
                throw new IOException("RT2: reading error: value is neither 0 or 1" + recordType2FixedPart1[2]);
            }
            if (hasVariableLabel) {
                // 2.4 [optional] 4-byte length of the variable label
                byte[] length_variable_label = new byte[4];
                readRT2Bytes(stream, length_variable_label, "RT 2: error reading recordType2.4");
                dbgLog.fine("nbytes_2_4=" + length_variable_label.length);
                ByteBuffer bb_length_variable_label = ByteBuffer.wrap(length_variable_label, 0, LENGTH_VARIABLE_LABEL);
                if (isLittleEndian) {
                    bb_length_variable_label.order(ByteOrder.LITTLE_ENDIAN);
                }
                int rawVariableLabelLength = bb_length_variable_label.getInt();
                dbgLog.fine("rawVariableLabelLength=" + rawVariableLabelLength);
                int variableLabelLength = getSAVintAdjustedBlockLength(rawVariableLabelLength);
                dbgLog.fine("RT2: variableLabelLength=" + variableLabelLength);
                // 2.5 [optional]variable label whose length is found at 2.4
                String variableLabel = "";
                if (rawVariableLabelLength > 0) {
                    byte[] variable_label = new byte[variableLabelLength];
                    readRT2Bytes(stream, variable_label, "RT 2: error reading recordType2.5");
                    dbgLog.fine("nbytes_2_5=" + variable_label.length);
                    variableLabel = getNullStrippedString(new String(Arrays.copyOfRange(variable_label, 0, rawVariableLabelLength), defaultCharSet));
                    dbgLog.fine("RT2: variableLabel=" + variableLabel + "<-");
                    dbgLog.fine(variableName + " => " + variableLabel);
                } else {
                    dbgLog.fine("RT2: defaulting to empty variable label.");
                }
                if (!extendedVariableMode) {
                    // We only have any use for this label if it's a "real" variable.
                    // Thinking about it, it doesn't make much sense for the "fake"
                    // variables that are actually chunks of large strings to store
                    // their own labels. But in some files they do. Then failing to read
                    // the bytes would result in getting out of sync with the RT record
                    // borders. So we always read the bytes, but only use them for
                    // the real variable entries.
                    variableLabelMap.put(variableName, variableLabel);
                }
            }
            if (extendedVariableMode) {
                // Nothing else is decoded for the chunks of a compound string
                // variable; that's how SPSS stores them.
                continue;
            }
            // 4th ([3]) element: Missing value type code
            // 0[none], 1, 2, 3 [point-type],-2[range], -3 [range type+ point]
            dbgLog.fine("RT: missing value unit follows?(if 0, none)=" + recordType2FixedPart1[3]);
            boolean hasMissingValues = validMissingValueCodeSet.contains(recordType2FixedPart1[3]) && (recordType2FixedPart1[3] != 0);
            InvalidData invalidDataInfo = null;
            if (recordType2FixedPart1[3] != 0) {
                invalidDataInfo = new InvalidData(recordType2FixedPart1[3]);
                dbgLog.fine("RT: missing value type=" + invalidDataInfo.getType());
            }
            // 2.2: print/write formats: 4-byte each = 8 bytes
            byte[] printFormt = Arrays.copyOfRange(recordType2Fixed, offset, offset + LENGTH_PRINT_FORMAT_CODE);
            dbgLog.fine("printFrmt=" + new String(Hex.encodeHex(printFormt)));
            offset += LENGTH_PRINT_FORMAT_CODE;
            int formatCode = isLittleEndian ? printFormt[2] : printFormt[1];
            int formatWidth = isLittleEndian ? printFormt[1] : printFormt[2];
            // TODO:
            // What should we be doing with these "format decimal positions"
            // in 4.0?
            // -- L.A. 4.0 alpha
            int formatDecimalPointPosition = isLittleEndian ? printFormt[0] : printFormt[3];
            dbgLog.fine("RT2: format code{5=F, 1=A[String]}=" + formatCode);
            formatDecimalPointPositionList.add(formatDecimalPointPosition);
            if (!SPSSConstants.FORMAT_CODE_TABLE_SAV.containsKey(formatCode)) {
                throw new IOException("Unknown format code was found = " + formatCode);
            } else {
                printFormatList.add(formatCode);
            }
            byte[] writeFormt = Arrays.copyOfRange(recordType2Fixed, offset, offset + LENGTH_WRITE_FORMAT_CODE);
            dbgLog.fine("RT2: writeFrmt=" + new String(Hex.encodeHex(writeFormt)));
            if (writeFormt[3] != 0x00) {
                dbgLog.fine("byte-order(write format): reversal required");
            }
            offset += LENGTH_WRITE_FORMAT_CODE;
            if (!SPSSConstants.ORDINARY_FORMAT_CODE_SET.contains(formatCode)) {
                StringBuilder sb = new StringBuilder(SPSSConstants.FORMAT_CODE_TABLE_SAV.get(formatCode) + formatWidth);
                if (formatDecimalPointPosition > 0) {
                    sb.append("." + formatDecimalPointPosition);
                }
                dbgLog.fine("formattable[i] = " + variableName + " -> " + sb.toString());
                printFormatNameTable.put(variableName, sb.toString());
            }
            printFormatTable.put(variableName, SPSSConstants.FORMAT_CODE_TABLE_SAV.get(formatCode));
            if (hasMissingValues) {
                dbgLog.fine("RT2: decoding missing value: type=" + recordType2FixedPart1[3]);
                int howManyMissingValueUnits = missingValueCodeUnits.get(recordType2FixedPart1[3]);
                dbgLog.fine("RT2: howManyMissingValueUnits=" + howManyMissingValueUnits);
                // 2.6 [optional] missing value units, one OBS block each
                byte[] missing_value_code_units = new byte[LENGTH_SAV_OBS_BLOCK * howManyMissingValueUnits];
                readRT2Bytes(stream, missing_value_code_units, "RT 2: reading recordType2.6");
                dbgLog.fine("nbytes_2_6=" + missing_value_code_units.length);
                if (isNumericVariable) {
                    double[] missingValues = new double[howManyMissingValueUnits];
                    List<String> mv = new ArrayList<String>();
                    int offset_start = 0;
                    for (int i = 0; i < howManyMissingValueUnits; i++) {
                        ByteBuffer doubleView = ByteBuffer.wrap(missing_value_code_units, offset_start, LENGTH_SAV_OBS_BLOCK);
                        offset_start += LENGTH_SAV_OBS_BLOCK;
                        if (isLittleEndian) {
                            doubleView.order(ByteOrder.LITTLE_ENDIAN);
                        }
                        missingValues[i] = doubleView.getDouble();
                        // The two extreme values are stored in their hex
                        // form; every other value goes through the formatter.
                        if (Double.toHexString(missingValues[i]).equals("-0x1.ffffffffffffep1023")) {
                            dbgLog.fine("1st value is LOWEST");
                            mv.add(Double.toHexString(missingValues[i]));
                        } else if (Double.valueOf(missingValues[i]).equals(Double.MAX_VALUE)) {
                            dbgLog.fine("2nd value is HIGHEST");
                            mv.add(Double.toHexString(missingValues[i]));
                        } else {
                            mv.add(doubleNumberFormatter.format(missingValues[i]));
                        }
                        dbgLog.fine(i + "-th missing value=" + Double.toHexString(missingValues[i]));
                    }
                    dbgLog.fine("variableName=" + variableName);
                    if (recordType2FixedPart1[3] > 0) {
                        // point cases only
                        dbgLog.fine("mv(>0)=" + mv);
                        missingValueTable.put(variableName, mv);
                        invalidDataInfo.setInvalidValues(mv);
                    } else if (recordType2FixedPart1[3] == -2) {
                        dbgLog.fine("mv(-2)=" + mv);
                        // range
                        invalidDataInfo.setInvalidRange(mv);
                    } else if (recordType2FixedPart1[3] == -3) {
                        // mixed case: a range (first two values) plus one point
                        dbgLog.fine("mv(-3)=" + mv);
                        invalidDataInfo.setInvalidRange(mv.subList(0, 2));
                        invalidDataInfo.setInvalidValues(mv.subList(2, 3));
                        missingValueTable.put(variableName, mv.subList(2, 3));
                    }
                    dbgLog.fine("missing value=" + StringUtils.join(missingValueTable.get(variableName), "|"));
                    dbgLog.fine("invalidDataInfo(Numeric):\n" + invalidDataInfo);
                    invalidDataTable.put(variableName, invalidDataInfo);
                } else {
                    // string variable case
                    String[] missingValues = new String[howManyMissingValueUnits];
                    List<String> mv = new ArrayList<String>();
                    int offset_start = 0;
                    int offset_end = LENGTH_SAV_OBS_BLOCK;
                    for (int i = 0; i < howManyMissingValueUnits; i++) {
                        missingValues[i] = StringUtils.stripEnd(new String(Arrays.copyOfRange(missing_value_code_units, offset_start, offset_end), defaultCharSet), " ");
                        dbgLog.fine("missing value=" + missingValues[i] + "<-");
                        offset_start = offset_end;
                        offset_end += LENGTH_SAV_OBS_BLOCK;
                        mv.add(missingValues[i]);
                    }
                    invalidDataInfo.setInvalidValues(mv);
                    missingValueTable.put(variableName, mv);
                    invalidDataTable.put(variableName, invalidDataInfo);
                    dbgLog.fine("missing value(str)=" + StringUtils.join(missingValueTable.get(variableName), "|"));
                    dbgLog.fine("invalidDataInfo(String):\n" + invalidDataInfo);
                }
                dbgLog.fine("invalidDataTable:\n" + invalidDataTable);
            }
        } catch (IOException ex) {
            throw ex;
        } catch (Exception ex) {
            // Best-effort: an unexpected (non-I/O) failure on one RT2 unit is
            // recorded and the loop moves on to the next unit.
            // TODO: should we be throwing some exception here?
            dbgLog.log(java.util.logging.Level.WARNING, "RT2: unexpected error while decoding the " + j + "-th RT2 unit", ex);
        }
    }
    if (j != OBSUnitsPerCase) {
        dbgLog.fine("RT2: attention! didn't reach the end of the OBS list!");
        throw new IOException("RT2: didn't reach the end of the OBS list!");
    }
    dbgLog.fine("RT2 metadata-related exit-chores");
    // /smd.getFileInformation().put("varQnty", variableCounter);
    dataTable.setVarQuantity(Long.valueOf(variableCounter));
    dbgLog.fine("RT2: varQnty=" + variableCounter);
    // 4.0 Initialize variables:
    List<DataVariable> variableList = new ArrayList<DataVariable>();
    for (int i = 0; i < variableCounter; i++) {
        DataVariable dv = new DataVariable();
        String varName = variableNameList.get(i);
        dbgLog.fine("name: " + varName);
        dv.setName(varName);
        String varLabel = variableLabelMap.get(varName);
        if (varLabel != null && varLabel.length() > 255) {
            // TODO:
            // variable labels will be changed into type 'TEXT' in the
            // database - this will eliminate the 255 char. limit.
            // -- L.A. 4.0 beta11
            dbgLog.fine("Have to truncate label: " + varLabel);
            varLabel = varLabel.substring(0, 255);
        }
        dbgLog.fine("label: " + varLabel);
        dv.setLabel(varLabel);
        dv.setInvalidRanges(new ArrayList<VariableRange>());
        dv.setSummaryStatistics(new ArrayList<SummaryStatistic>());
        dv.setUnf("UNF:6:");
        dv.setCategories(new ArrayList<VariableCategory>());
        variableList.add(dv);
        dv.setFileOrder(i);
        dv.setDataTable(dataTable);
    }
    dataTable.setDataVariables(variableList);
    // /smd.setVariableName(variableNameList.toArray(new String[variableNameList.size()]));
    // /smd.setVariableLabel(variableLabelMap);
    // TODO:
    // figure out what to do with the missing value table!
    // -- 4.0 alpha
    // well, they were used to generate merged summary statistics for
    // the variable. So need to verify what the DDI import was doing
    // with them and replicate the same in 4.0.
    // (add appropriate value labels?)
    // /TODO: 4.0 smd.setMissingValueTable(missingValueTable);
    // /smd.getFileInformation().put("caseWeightVariableName", caseWeightVariableName);
    dbgLog.fine("sumstat:long case=" + Arrays.deepToString(variableTypelList.toArray()));
    dbgLog.fine("RT2: OBSwiseTypelList=" + OBSwiseTypelList);
    dbgLog.fine("decodeRecordType2(): end");
}

/**
 * Fills {@code buffer} completely from {@code stream}, looping over short reads.
 * An empty buffer is left untouched and nothing is read.
 *
 * @param stream the stream to read from
 * @param buffer the destination; exactly {@code buffer.length} bytes are read
 * @param what   a description of the segment, used as the error message prefix
 * @throws IOException if the stream ends before the buffer has been filled
 */
private static void readRT2Bytes(BufferedInputStream stream, byte[] buffer, String what) throws IOException {
    int total = 0;
    while (total < buffer.length) {
        int n = stream.read(buffer, total, buffer.length - total);
        if (n < 0) {
            throw new IOException(what + ": unexpected end of stream; " + buffer.length + " bytes requested, " + total + " bytes read");
        }
        total += n;
    }
}

Also used : DataVariable(edu.harvard.iq.dataverse.datavariable.DataVariable) VariableRange(edu.harvard.iq.dataverse.datavariable.VariableRange) VariableCategory(edu.harvard.iq.dataverse.datavariable.VariableCategory) SummaryStatistic(edu.harvard.iq.dataverse.datavariable.SummaryStatistic) InvalidData(edu.harvard.iq.dataverse.ingest.tabulardata.InvalidData)

Example 7 with VariableCategory

use of edu.harvard.iq.dataverse.datavariable.VariableCategory in project dataverse by IQSS.

the class DdiExportUtil method createVarDDI.

/**
 * Writes one {@code var} element describing the given data variable.
 *
 * <p>The element carries the ID/name (and optional dcml, nature, intrvl)
 * attributes, followed in this order by: {@code location}, {@code labl},
 * {@code invalrng}, {@code universe}, one {@code sumStat} per summary
 * statistic, one {@code catgry} per category, {@code varFormat} and
 * {@code notes} (the UNF). Optional children are written only when the
 * corresponding value is present.
 *
 * @param xmlw the writer the element is emitted to
 * @param dv   the variable being exported
 * @throws XMLStreamException if the writer fails, or if the variable is
 *         neither numeric nor character
 */
private static void createVarDDI(XMLStreamWriter xmlw, DataVariable dv) throws XMLStreamException {
    xmlw.writeStartElement("var");
    writeAttribute(xmlw, "ID", "v" + dv.getId().toString());
    writeAttribute(xmlw, "name", dv.getName());
    if (dv.getNumberOfDecimalPoints() != null) {
        writeAttribute(xmlw, "dcml", dv.getNumberOfDecimalPoints().toString());
    }
    if (dv.isOrderedCategorical()) {
        writeAttribute(xmlw, "nature", "ordinal");
    }
    if (dv.getInterval() != null) {
        String intervalLabel = dv.getIntervalLabel();
        if (intervalLabel != null) {
            writeAttribute(xmlw, "intrvl", intervalLabel);
        }
    }

    // <location/>: position of the variable within its file
    xmlw.writeEmptyElement("location");
    if (dv.getFileStartPosition() != null) {
        writeAttribute(xmlw, "StartPos", dv.getFileStartPosition().toString());
    }
    if (dv.getFileEndPosition() != null) {
        writeAttribute(xmlw, "EndPos", dv.getFileEndPosition().toString());
    }
    if (dv.getRecordSegmentNumber() != null) {
        writeAttribute(xmlw, "RecSegNo", dv.getRecordSegmentNumber().toString());
    }
    writeAttribute(xmlw, "fileid", "f" + dv.getDataTable().getDataFile().getId().toString());

    // <labl>: variable-level label
    String variableLabel = dv.getLabel();
    if (!StringUtilisEmpty(variableLabel)) {
        xmlw.writeStartElement("labl");
        writeAttribute(xmlw, "level", "variable");
        xmlw.writeCharacters(variableLabel);
        xmlw.writeEndElement();
    }

    // <invalrng>: opened by checkParentElement on the first item/range written
    boolean invalrngOpen = false;
    for (VariableRange invalid : dv.getInvalidRanges()) {
        boolean isPoint = invalid.getBeginValueType() != null && invalid.isBeginValueTypePoint();
        if (isPoint) {
            // a single invalid value
            if (invalid.getBeginValue() != null) {
                invalrngOpen = checkParentElement(xmlw, "invalrng", invalrngOpen);
                xmlw.writeEmptyElement("item");
                writeAttribute(xmlw, "VALUE", invalid.getBeginValue());
            }
            continue;
        }
        // an invalid interval with optional lower and upper bounds
        invalrngOpen = checkParentElement(xmlw, "invalrng", invalrngOpen);
        xmlw.writeEmptyElement("range");
        boolean hasLowerBound = invalid.getBeginValueType() != null && invalid.getBeginValue() != null;
        if (hasLowerBound && invalid.isBeginValueTypeMin()) {
            writeAttribute(xmlw, "min", invalid.getBeginValue());
        } else if (hasLowerBound && invalid.isBeginValueTypeMinExcl()) {
            writeAttribute(xmlw, "minExclusive", invalid.getBeginValue());
        }
        boolean hasUpperBound = invalid.getEndValueType() != null && invalid.getEndValue() != null;
        if (hasUpperBound && invalid.isEndValueTypeMax()) {
            writeAttribute(xmlw, "max", invalid.getEndValue());
        } else if (hasUpperBound && invalid.isEndValueTypeMaxExcl()) {
            writeAttribute(xmlw, "maxExclusive", invalid.getEndValue());
        }
    }
    if (invalrngOpen) {
        xmlw.writeEndElement();
    }

    // <universe>
    String universe = dv.getUniverse();
    if (!StringUtilisEmpty(universe)) {
        xmlw.writeStartElement("universe");
        xmlw.writeCharacters(universe);
        xmlw.writeEndElement();
    }

    // <sumStat>: one element per summary statistic
    for (SummaryStatistic statistic : dv.getSummaryStatistics()) {
        xmlw.writeStartElement("sumStat");
        writeAttribute(xmlw, "type", statistic.getTypeLabel() != null ? statistic.getTypeLabel() : "unknown");
        xmlw.writeCharacters(statistic.getValue());
        xmlw.writeEndElement();
    }

    // <catgry>: one element per category, with value, label and frequency
    for (VariableCategory category : dv.getCategories()) {
        xmlw.writeStartElement("catgry");
        if (category.isMissing()) {
            writeAttribute(xmlw, "missing", "Y");
        }

        xmlw.writeStartElement("catValu");
        xmlw.writeCharacters(category.getValue());
        xmlw.writeEndElement();

        String categoryLabel = category.getLabel();
        if (!StringUtilisEmpty(categoryLabel)) {
            xmlw.writeStartElement("labl");
            writeAttribute(xmlw, "level", "category");
            xmlw.writeCharacters(categoryLabel);
            xmlw.writeEndElement();
        }

        if (category.getFrequency() != null) {
            xmlw.writeStartElement("catStat");
            writeAttribute(xmlw, "type", "freq");
            // a whole-number frequency is written as "100" rather than "100.0"
            boolean wholeNumber = Math.floor(category.getFrequency()) == category.getFrequency();
            xmlw.writeCharacters(wholeNumber
                    ? Long.toString(category.getFrequency().longValue())
                    : category.getFrequency().toString());
            xmlw.writeEndElement();
        }

        // closes catgry
        xmlw.writeEndElement();
    }

    // <varFormat/>: only numeric and character variables are supported
    xmlw.writeEmptyElement("varFormat");
    String formatType;
    if (dv.isTypeNumeric()) {
        formatType = "numeric";
    } else if (dv.isTypeCharacter()) {
        formatType = "character";
    } else {
        throw new XMLStreamException("Illegal Variable Format Type!");
    }
    writeAttribute(xmlw, "type", formatType);
    writeAttribute(xmlw, "formatname", dv.getFormat());
    // experiment writeAttribute(xmlw, "schema", dv.getFormatSchema());
    writeAttribute(xmlw, "category", dv.getFormatCategory());

    // <notes>: the variable's UNF, when one is set
    String unf = dv.getUnf();
    if (unf != null && !unf.isEmpty()) {
        xmlw.writeStartElement("notes");
        writeAttribute(xmlw, "subject", "Universal Numeric Fingerprint");
        writeAttribute(xmlw, "level", "variable");
        writeAttribute(xmlw, "type", "Dataverse:UNF");
        xmlw.writeCharacters(unf);
        xmlw.writeEndElement();
    }

    // closes var
    xmlw.writeEndElement();
}

Also used : VariableCategory(edu.harvard.iq.dataverse.datavariable.VariableCategory) XMLStreamException(javax.xml.stream.XMLStreamException) VariableRange(edu.harvard.iq.dataverse.datavariable.VariableRange) SummaryStatistic(edu.harvard.iq.dataverse.datavariable.SummaryStatistic)

Example 8 with VariableCategory

use of edu.harvard.iq.dataverse.datavariable.VariableCategory in project dataverse by IQSS.

the class DDIExportServiceBean method createVarDDI.

/**
 * Writes one DDI {@code <var>} element describing the given data variable.
 *
 * <p>The {@code ID}, {@code name}, {@code dcml}, {@code nature} and {@code intrvl}
 * attributes are always considered; every child section ({@code location},
 * {@code labl}, {@code invalrng}, {@code universe}, {@code sumStat}, {@code catgry},
 * {@code varFormat} and the UNF {@code notes}) is written only when
 * {@code checkField} returns true for it (presumably that helper applies the
 * exclude/include sets -- confirm against its definition).
 *
 * @param xmlw             writer the element is emitted to
 * @param excludedFieldSet passed through to {@code checkField}
 * @param includedFieldSet passed through to {@code checkField}
 * @param dv               the variable to describe
 * @throws XMLStreamException if the writer fails, or if the variable is neither
 *                            numeric nor character when {@code varFormat} is written
 */
private void createVarDDI(XMLStreamWriter xmlw, Set<String> excludedFieldSet, Set<String> includedFieldSet, DataVariable dv) throws XMLStreamException {
    xmlw.writeStartElement("var");
    writeAttribute(xmlw, "ID", "v" + dv.getId().toString());
    writeAttribute(xmlw, "name", dv.getName());
    if (dv.getNumberOfDecimalPoints() != null) {
        writeAttribute(xmlw, "dcml", dv.getNumberOfDecimalPoints().toString());
    }
    if (dv.isOrderedCategorical()) {
        writeAttribute(xmlw, "nature", "ordinal");
    }
    if (dv.getInterval() != null) {
        String interval = dv.getIntervalLabel();
        if (interval != null) {
            writeAttribute(xmlw, "intrvl", interval);
        }
    }
    // location
    if (checkField("location", excludedFieldSet, includedFieldSet)) {
        // an empty element: the attributes written below attach to it
        xmlw.writeEmptyElement("location");
        if (dv.getFileStartPosition() != null) {
            writeAttribute(xmlw, "StartPos", dv.getFileStartPosition().toString());
        }
        if (dv.getFileEndPosition() != null) {
            writeAttribute(xmlw, "EndPos", dv.getFileEndPosition().toString());
        }
        if (dv.getRecordSegmentNumber() != null) {
            writeAttribute(xmlw, "RecSegNo", dv.getRecordSegmentNumber().toString());
        }
        writeAttribute(xmlw, "fileid", "f" + dv.getDataTable().getDataFile().getId().toString());
    }
    // labl
    if (checkField("labl", excludedFieldSet, includedFieldSet)) {
        if (!StringUtilisEmpty(dv.getLabel())) {
            xmlw.writeStartElement("labl");
            writeAttribute(xmlw, "level", "variable");
            xmlw.writeCharacters(dv.getLabel());
            // labl
            xmlw.writeEndElement();
        }
    }
    // invalrng
    if (checkField("invalrng", excludedFieldSet, includedFieldSet)) {
        // Tracks whether the enclosing <invalrng> has been started, so that it is
        // closed below only if it was opened (checkParentElement presumably opens
        // it on first use -- confirm against its definition).
        boolean invalrngAdded = false;
        for (VariableRange range : dv.getInvalidRanges()) {
            if (range.getBeginValueType() != null && range.isBeginValueTypePoint()) {
                // a single invalid value: written as <item VALUE="..."/>
                if (range.getBeginValue() != null) {
                    invalrngAdded = checkParentElement(xmlw, "invalrng", invalrngAdded);
                    xmlw.writeEmptyElement("item");
                    writeAttribute(xmlw, "VALUE", range.getBeginValue());
                }
            } else {
                // an interval: written as <range .../> with whichever bounds are set
                invalrngAdded = checkParentElement(xmlw, "invalrng", invalrngAdded);
                xmlw.writeEmptyElement("range");
                if (range.getBeginValueType() != null && range.getBeginValue() != null) {
                    if (range.isBeginValueTypeMin()) {
                        writeAttribute(xmlw, "min", range.getBeginValue());
                    } else if (range.isBeginValueTypeMinExcl()) {
                        writeAttribute(xmlw, "minExclusive", range.getBeginValue());
                    }
                }
                if (range.getEndValueType() != null && range.getEndValue() != null) {
                    if (range.isEndValueTypeMax()) {
                        writeAttribute(xmlw, "max", range.getEndValue());
                    } else if (range.isEndValueTypeMaxExcl()) {
                        writeAttribute(xmlw, "maxExclusive", range.getEndValue());
                    }
                }
            }
        }
        if (invalrngAdded) {
            // invalrng
            xmlw.writeEndElement();
        }
    }
    // universe
    if (checkField("universe", excludedFieldSet, includedFieldSet)) {
        if (!StringUtilisEmpty(dv.getUniverse())) {
            xmlw.writeStartElement("universe");
            xmlw.writeCharacters(dv.getUniverse());
            // universe
            xmlw.writeEndElement();
        }
    }
    // sum stats
    if (checkField("sumStat", excludedFieldSet, includedFieldSet)) {
        for (SummaryStatistic sumStat : dv.getSummaryStatistics()) {
            xmlw.writeStartElement("sumStat");
            if (sumStat.getTypeLabel() != null) {
                writeAttribute(xmlw, "type", sumStat.getTypeLabel());
            } else {
                writeAttribute(xmlw, "type", "unknown");
            }
            xmlw.writeCharacters(sumStat.getValue());
            // sumStat
            xmlw.writeEndElement();
        }
    }
    // categories
    if (checkField("catgry", excludedFieldSet, includedFieldSet)) {
        for (VariableCategory cat : dv.getCategories()) {
            xmlw.writeStartElement("catgry");
            if (cat.isMissing()) {
                writeAttribute(xmlw, "missing", "Y");
            }
            // catValu
            xmlw.writeStartElement("catValu");
            xmlw.writeCharacters(cat.getValue());
            // catValu
            xmlw.writeEndElement();
            // label
            if (!StringUtilisEmpty(cat.getLabel())) {
                xmlw.writeStartElement("labl");
                writeAttribute(xmlw, "level", "category");
                xmlw.writeCharacters(cat.getLabel());
                // labl
                xmlw.writeEndElement();
            }
            // catStat
            Double frequency = cat.getFrequency();
            if (frequency != null) {
                xmlw.writeStartElement("catStat");
                writeAttribute(xmlw, "type", "freq");
                // if frequency is actually a long value, we want to write "100" instead of "100.0"
                if (Math.floor(frequency) == frequency) {
                    xmlw.writeCharacters(Long.toString(frequency.longValue()));
                } else {
                    xmlw.writeCharacters(frequency.toString());
                }
                // catStat
                xmlw.writeEndElement();
            }
            // catgry
            xmlw.writeEndElement();
        }
    }
    // varFormat
    if (checkField("varFormat", excludedFieldSet, includedFieldSet)) {
        xmlw.writeEmptyElement("varFormat");
        if (dv.isTypeNumeric()) {
            writeAttribute(xmlw, "type", "numeric");
        } else if (dv.isTypeCharacter()) {
            writeAttribute(xmlw, "type", "character");
        } else {
            throw new XMLStreamException("Illegal Variable Format Type!");
        }
        writeAttribute(xmlw, "formatname", dv.getFormat());
        // experiment writeAttribute(xmlw, "schema", dv.getFormatSchema());
        writeAttribute(xmlw, "category", dv.getFormatCategory());
    }
    // notes
    if (checkField("unf", excludedFieldSet, includedFieldSet)) {
        // Skip the note when the variable has no UNF: a null must not reach
        // writeCharacters, and a note with no fingerprint carries no information.
        String unf = dv.getUnf();
        if (unf != null && !unf.isEmpty()) {
            xmlw.writeStartElement("notes");
            writeAttribute(xmlw, "subject", "Universal Numeric Fingerprint");
            writeAttribute(xmlw, "level", "variable");
            writeAttribute(xmlw, "type", "VDC:UNF");
            xmlw.writeCharacters(unf);
            // notes
            xmlw.writeEndElement();
        }
    }
    // var
    xmlw.writeEndElement();
}

Example 9 with VariableCategory

use of edu.harvard.iq.dataverse.datavariable.VariableCategory in project dataverse by IQSS.

the class DTA117FileReader method readValueLabels.

/**
 * Reads the value labels section: a sequence of label definitions, each
 * holding a named table of (category value, label text) pairs. After each
 * table is read, its pairs are attached as {@code VariableCategory} objects
 * to every variable whose entry in {@code valueLabelsLookupTable} equals the
 * table name.
 *
 * @param reader the reader the section is consumed from
 * @throws IOException if the byte counts read do not add up to the lengths
 *                     declared in the table, or if the reader fails
 */
private void readValueLabels(DataReader reader) throws IOException {
    logger.fine("Value Labels section; at offset " + reader.getByteOffset() + "; dta map offset: " + dtaMap.getOffset_vallabs());
    logger.fine("readValueLabels(): start.");
    // TODO: check that we are at the right byte offset!
    reader.readOpeningTag(TAG_VALUE_LABELS);
    final String definitionOpeningTag = "<" + TAG_VALUE_LABELS_LBL_DEF + ">";
    while (reader.checkTag(definitionOpeningTag)) {
        // TODO: checktag should probably *read* the tag, if it is indeed
        // encountered, rather than stop at the beginning of the tag.
        reader.readOpeningTag(TAG_VALUE_LABELS_LBL_DEF);
        long declaredTableLength = reader.readInteger();
        // TODO: do we need to worry about uniqueness? or has Stata already
        // guaranteed that there are no other category value tables
        // defined under this name?
        String tableName = reader.readString(33);
        // TODO: skipBytes() instead
        reader.readBytes(3);

        // The value_label_table follows; its size should be declaredTableLength.
        int categoryCount = (int) reader.readInteger();
        long declaredTextLength = reader.readInteger();

        long[] labelOffsets = new long[categoryCount];
        long[] categoryValues = new long[categoryCount];
        String[] categoryLabels = new String[categoryCount];

        // All label offsets come first ...
        for (int k = 0; k < categoryCount; k++) {
            labelOffsets[k] = reader.readInteger();
        }
        // ... then all category values.
        // TODO: can the category values be negative?
        for (int k = 0; k < categoryCount; k++) {
            categoryValues[k] = reader.readInteger();
        }

        // Each label is taken to run from its own offset up to the next
        // entry's offset; the last one runs up to the declared text length.
        int labelBytesRead = 0;
        for (int k = 0; k < categoryCount; k++) {
            long labelStart = labelOffsets[k];
            long labelEnd;
            if (k < categoryCount - 1) {
                labelEnd = labelOffsets[k + 1];
            } else {
                labelEnd = declaredTextLength;
            }
            int labelLength = (int) (labelEnd - labelStart);
            categoryLabels[k] = reader.readString(labelLength);
            labelBytesRead += labelLength;
        }

        // Bytes accounted for: the two leading integers (8 bytes), one 4-byte
        // offset and one 4-byte value per category, plus the label text.
        long tableBytesRead = 8L + 4L * categoryCount + 4L * categoryCount + labelBytesRead;
        if (labelBytesRead != declaredTextLength) {
            throw new IOException("<read mismatch in readLabels()>");
        }
        if (tableBytesRead != declaredTableLength) {
            throw new IOException("<read mismatch in readLabels() 2>");
        }
        reader.readClosingTag(TAG_VALUE_LABELS_LBL_DEF);

        // Attach the categories to every DataVariable that uses this table:
        for (int i = 0; i < dataTable.getVarQuantity(); i++) {
            if (!tableName.equals(valueLabelsLookupTable[i])) {
                continue;
            }
            logger.fine("cross-linking value label table for " + tableName);
            for (int k = 0; k < categoryCount; k++) {
                VariableCategory category = new VariableCategory();
                category.setValue(String.valueOf(categoryValues[k]));
                category.setLabel(categoryLabels[k]);
                /* cross-link the variable and category to each other: */
                category.setDataVariable(dataTable.getDataVariables().get(i));
                dataTable.getDataVariables().get(i).getCategories().add(category);
            }
        }
    }
    reader.readClosingTag(TAG_VALUE_LABELS);
    logger.fine("readValueLabels(): end.");
}

Also used : VariableCategory(edu.harvard.iq.dataverse.datavariable.VariableCategory)

Example 10 with VariableCategory

use of edu.harvard.iq.dataverse.datavariable.VariableCategory in project dataverse by IQSS.

the class DTAFileReader method parseValueLabelsRelease105.

/**
 * Reads the value-label tables from the stream, then creates a
 * {@code VariableCategory} for every value-label pair of every variable whose
 * entry in {@code valueLabelsLookupTable} names one of the tables read.
 *
 * <p>At most one table is read per variable; reading stops early once
 * {@code stream.available()} reports 0.
 *
 * @param stream the stream positioned at the first value-label table header
 * @throws IllegalArgumentException if {@code stream} is null
 * @throws IOException if the stream ends where a header or table is expected,
 *                     a table declares zero or a negative number of pairs,
 *                     or the underlying read fails
 */
void parseValueLabelsRelease105(BufferedInputStream stream) throws IOException {
    dbgLog.fine("parseValueLabelsRelease105(): start");
    if (stream == null) {
        throw new IllegalArgumentException("stream == null!");
    }
    int nvar = dataTable.getVarQuantity().intValue();
    // note: caution +1 as the null character, not 9 byte
    int length_label_name = constantTable.get("NAME") + 1;
    int length_value_label_header = value_label_table_length + length_label_name;
    if (dbgLog.isLoggable(Level.FINE)) {
        dbgLog.fine("value_label_table_length=" + value_label_table_length);
    }
    if (dbgLog.isLoggable(Level.FINE)) {
        dbgLog.fine("length_value_label_header=" + length_value_label_header);
    }
    int length_label_field = 8;
    /*
         Seg  field         byte    type
         1-1. no of pairs      2    int  (= m)
         1-2. vlt_name        10    includes char+(\0) == name used in Sec2.part 5
         -----------------------------------
         11
         2-1. values         2*n    int[]
         2-2. labels         8*n    char
         */
    // This map will hold a temporary lookup table for all the categorical
    // value-label groups we are going to find here:
    // These groups have unique names, and a group *may be shared* between
    // multiple variables. In the method decodeDescriptorValueLabel above
    // we have populated a lookup table where variables are linked to the
    // corresponding value-label groups by name. Thus we must fully populate
    // the full map of all the variable groups, then go through the list
    // of variables and create the dataverse variable categories from
    // them. -- L.A. 4.0
    Map<String, Map<String, String>> tempValueLabelTable = new LinkedHashMap<>();
    for (int i = 0; i < nvar; i++) {
        if (dbgLog.isLoggable(Level.FINE)) {
            dbgLog.fine("\n\n" + i + "th value-label table header");
        }
        byte[] valueLabelHeader = new byte[length_value_label_header];
        // Part 1: reading the header of a value-label table if exists
        int nbytes = stream.read(valueLabelHeader, 0, length_value_label_header);
        // read() reports end of stream as -1, never as 0, for a positive length
        if (nbytes <= 0) {
            throw new IOException("reading value label header: no datum");
        }
        // 1.1 number of value-label pairs in this table (= m)
        ByteBuffer bb_value_label_pairs = ByteBuffer.wrap(valueLabelHeader, 0, value_label_table_length);
        if (isLittleEndian) {
            bb_value_label_pairs.order(ByteOrder.LITTLE_ENDIAN);
        }
        int no_value_label_pairs = bb_value_label_pairs.getShort();
        if (dbgLog.isLoggable(Level.FINE)) {
            dbgLog.fine("no_value_label_pairs=" + no_value_label_pairs);
        }
        // The count is a signed short taken from the file; a negative value can
        // only come from corrupt input and would otherwise surface as a
        // NegativeArraySizeException below.
        if (no_value_label_pairs < 0) {
            throw new IOException("reading value label header: negative number of value-label pairs: " + no_value_label_pairs);
        }
        // 1.2 labelName
        String rawLabelName = new String(Arrays.copyOfRange(valueLabelHeader, value_label_table_length, (value_label_table_length + length_label_name)), "ISO-8859-1");
        if (dbgLog.isLoggable(Level.FINE)) {
            dbgLog.fine("rawLabelName(length)=" + rawLabelName.length());
        }
        // Cut the name at its null terminator; if there is none, use the whole
        // field (the same treatment the labels get below) instead of failing
        // in substring() with a negative end index.
        int nameTerminator = rawLabelName.indexOf(0);
        String labelName = nameTerminator != -1 ? rawLabelName.substring(0, nameTerminator) : rawLabelName;
        if (dbgLog.isLoggable(Level.FINE)) {
            dbgLog.fine("label name = " + labelName + "\n");
        }
        if (dbgLog.isLoggable(Level.FINE)) {
            dbgLog.fine(i + "-th value-label table");
        }
        // Part 2: reading the value-label table
        // the length of the value-label table is: 2*m + 8*m = 10*m
        int length_value_label_table = (value_label_table_length + length_label_field) * no_value_label_pairs;
        if (dbgLog.isLoggable(Level.FINE)) {
            dbgLog.fine("length_value_label_table=" + length_value_label_table);
        }
        byte[] valueLabelTable_i = new byte[length_value_label_table];
        int noBytes = stream.read(valueLabelTable_i, 0, length_value_label_table);
        // -1 means end of stream; 0 is returned only when the requested length
        // is 0, i.e. for a table declaring no pairs, which was always rejected here
        if (noBytes <= 0) {
            throw new IOException("reading value label table: no datum");
        }
        // 2-1. 2-byte-integer array (2*m): value array (sorted)
        short[] valueList = new short[no_value_label_pairs];
        int offset_value = 0;
        for (int k = 0; k < no_value_label_pairs; k++) {
            ByteBuffer bb_value_list = ByteBuffer.wrap(valueLabelTable_i, offset_value, value_label_table_length);
            if (isLittleEndian) {
                bb_value_list.order(ByteOrder.LITTLE_ENDIAN);
            }
            valueList[k] = bb_value_list.getShort();
            offset_value += value_label_table_length;
        }
        if (dbgLog.isLoggable(Level.FINE)) {
            dbgLog.fine("value_list=" + Arrays.toString(valueList) + "\n");
        }
        // 2-2. 8-byte chars that store label data (m units of labels)
        if (dbgLog.isLoggable(Level.FINE)) {
            dbgLog.fine("current offset_value=" + offset_value);
        }
        int offset_start = offset_value;
        int offset_end = offset_value + length_label_field;
        String[] labelList = new String[no_value_label_pairs];
        for (int l = 0; l < no_value_label_pairs; l++) {
            String string_l = new String(Arrays.copyOfRange(valueLabelTable_i, offset_start, offset_end), "ISO-8859-1");
            int null_position = string_l.indexOf(0);
            if (null_position != -1) {
                labelList[l] = string_l.substring(0, null_position);
            } else {
                labelList[l] = string_l;
            }
            offset_start = offset_end;
            offset_end += length_label_field;
        }
        // Finally, we've reached the actual value-label pairs. We'll go
        // through them and put them on the temporary lookup map:
        Map<String, String> pairsForTable = new LinkedHashMap<>();
        tempValueLabelTable.put(labelName, pairsForTable);
        for (int j = 0; j < no_value_label_pairs; j++) {
            if (dbgLog.isLoggable(Level.FINE)) {
                dbgLog.fine(j + "-th pair:" + valueList[j] + "[" + labelList[j] + "]");
            }
            // TODO: do we need any null/empty string checks here? -- L.A. 4.0
            pairsForTable.put(Integer.toString(valueList[j]), labelList[j]);
        }
        if (stream.available() == 0) {
            // reached the end of the file
            if (dbgLog.isLoggable(Level.FINE)) {
                dbgLog.fine("reached the end of file at " + i + "th value-label Table.");
            }
            break;
        }
    }
    // Second pass: create the categories for each variable that references
    // one of the tables collected above.
    for (int i = 0; i < nvar; i++) {
        String tableName = valueLabelsLookupTable[i];
        if (tableName == null) {
            continue;
        }
        Map<String, String> pairsForVariable = tempValueLabelTable.get(tableName);
        if (pairsForVariable == null) {
            continue;
        }
        for (Map.Entry<String, String> pair : pairsForVariable.entrySet()) {
            VariableCategory cat = new VariableCategory();
            cat.setValue(pair.getKey());
            cat.setLabel(pair.getValue());
            /* cross-link the variable and category to each other: */
            cat.setDataVariable(dataTable.getDataVariables().get(i));
            dataTable.getDataVariables().get(i).getCategories().add(cat);
        }
    }
    dbgLog.fine("parseValueLabelsRelease105(): end");
}

Also used : VariableCategory(edu.harvard.iq.dataverse.datavariable.VariableCategory)

Aggregations

VariableCategory (edu.harvard.iq.dataverse.datavariable.VariableCategory): 11, SummaryStatistic (edu.harvard.iq.dataverse.datavariable.SummaryStatistic): 4, VariableRange (edu.harvard.iq.dataverse.datavariable.VariableRange): 4, DataVariable (edu.harvard.iq.dataverse.datavariable.DataVariable): 3, XMLStreamException (javax.xml.stream.XMLStreamException): 2, InvalidData (edu.harvard.iq.dataverse.ingest.tabulardata.InvalidData): 1, ArrayList (java.util.ArrayList): 1, HashMap (java.util.HashMap): 1, LinkedHashMap (java.util.LinkedHashMap): 1, Map (java.util.Map): 1, REXPMismatchException (org.rosuda.REngine.REXPMismatchException): 1, RList (org.rosuda.REngine.RList): 1