Search in sources :

Example 11 with VariableCategory

Use of edu.harvard.iq.dataverse.datavariable.VariableCategory in the dataverse project by IQSS.

The method processVariableInfo of the class RDATAFileReader.

/**
 * Applies the per-column R meta-data in {@code metaInfo} to the data
 * variables of {@code dataTable}. Nothing is returned; the variables are
 * modified in place.
 *
 * <p>Entry {@code k} of {@code metaInfo} is read as a named list of the form
 * {@code list(type = 1, type.string = "integer", class = class(values),
 * levels = NULL, format = NULL)} and is applied to the variable at index
 * {@code k} of {@code dataTable.getDataVariables()}, depending on
 * {@code type.string}:
 * <ul>
 *   <li>missing, {@code "character"} or {@code "other"}: character, discrete;</li>
 *   <li>{@code "integer"}: numeric, discrete;</li>
 *   <li>{@code "numeric"} or {@code "double"}: numeric, continuous;</li>
 *   <li>anything starting with {@code "Date"}: character, discrete, with the
 *       R format stored on the variable; the format category is
 *       {@code "date"} for {@code "Date"} and {@code "time"} for
 *       {@code "DateTime"};</li>
 *   <li>{@code "factor"}: character, discrete, with one category per level
 *       (numbered from 1 when the format is {@code "ordered"});</li>
 *   <li>{@code "logical"}: numeric, discrete, format category
 *       {@code "Boolean"}, with the categories 0/FALSE and 1/TRUE;</li>
 *   <li>any other type name: the variable is left untouched.</li>
 * </ul>
 *
 * <p>A column whose meta-data cannot be converted
 * ({@link REXPMismatchException}) is logged and skipped; the remaining
 * columns are still processed.
 *
 * @param metaInfo an "RList" Object containing, per column, the indices
 * type, type.string, class, levels, and format.
 * @param dataTable a dataverse DataTable object whose data variables are
 * updated
 * @throws IOException declared by the original signature and kept so that
 * existing callers continue to compile
 */
private void processVariableInfo(RList metaInfo, DataTable dataTable) throws IOException {
    for (int k = 0; k < metaInfo.size(); k++) {
        try {
            // Meta-data for a column in the data-set
            RList columnMeta = metaInfo.at(k).asList();

            // Extract information from the returned list; an R NULL becomes
            // null (or an empty array for the levels).
            Integer variableType = !columnMeta.at("type").isNull() ? columnMeta.at("type").asInteger() : null;
            String variableTypeName = !columnMeta.at("type.string").isNull() ? columnMeta.at("type.string").asString() : null;
            String[] variableLevels = !columnMeta.at("levels").isNull() ? columnMeta.at("levels").asStrings() : new String[0];
            String variableFormat = !columnMeta.at("format").isNull() ? columnMeta.at("format").asString() : null;

            LOG.fine("variable type: " + variableType);
            LOG.fine("variable type name: " + variableTypeName);
            LOG.fine("variable format: " + variableFormat);
            for (String variableLevel : variableLevels) {
                LOG.fine("variable level: " + variableLevel);
            }

            if (variableTypeName == null || variableTypeName.equals("character") || variableTypeName.equals("other")) {
                // This is a String:
                dataTable.getDataVariables().get(k).setTypeCharacter();
                dataTable.getDataVariables().get(k).setIntervalDiscrete();
            } else if (variableTypeName.equals("integer")) {
                dataTable.getDataVariables().get(k).setTypeNumeric();
                dataTable.getDataVariables().get(k).setIntervalDiscrete();
            } else if (variableTypeName.equals("numeric") || variableTypeName.equals("double")) {
                dataTable.getDataVariables().get(k).setTypeNumeric();
                dataTable.getDataVariables().get(k).setIntervalContinuous();
            } else if (variableTypeName.startsWith("Date")) {
                // Dates and date-times are kept as character data; the R
                // format string (possibly null) is stored on the variable.
                dataTable.getDataVariables().get(k).setTypeCharacter();
                dataTable.getDataVariables().get(k).setIntervalDiscrete();
                dataTable.getDataVariables().get(k).setFormat(variableFormat);
                if (variableTypeName.equals("Date")) {
                    dataTable.getDataVariables().get(k).setFormatCategory("date");
                } else if (variableTypeName.equals("DateTime")) {
                    dataTable.getDataVariables().get(k).setFormatCategory("time");
                }
            } else if (variableTypeName.equals("factor")) {
                // All R factors are *string* factors!
                dataTable.getDataVariables().get(k).setTypeCharacter();
                dataTable.getDataVariables().get(k).setIntervalDiscrete();
                // variableLevels is never null here (an R NULL was mapped to
                // an empty array above), so only the length matters.
                if (variableLevels.length > 0) {
                    // yes, this is a factor, with levels defined.
                    LOG.fine("this is a factor.");
                    boolean ordered = "ordered".equals(variableFormat);
                    if (ordered) {
                        LOG.fine("an ordered factor, too");
                    }
                    for (int i = 0; i < variableLevels.length; i++) {
                        VariableCategory cat = new VariableCategory();
                        cat.setValue(variableLevels[i]);
                        // Sadly, R factors don't have descriptive labels;
                        // the level itself doubles as the label.
                        cat.setLabel(variableLevels[i]);
                        if (ordered) {
                            // 1-based position of the level
                            cat.setOrder(i + 1);
                        }
                        /* cross-link the variable and category to each other: */
                        cat.setDataVariable(dataTable.getDataVariables().get(k));
                        dataTable.getDataVariables().get(k).getCategories().add(cat);
                    }
                    dataTable.getDataVariables().get(k).setOrderedCategorical(ordered);
                }
            } else if (variableTypeName.equals("logical")) {
                // Logical columns become numeric 0/1 variables with two
                // fixed categories labelled FALSE and TRUE.
                dataTable.getDataVariables().get(k).setFormatCategory("Boolean");
                dataTable.getDataVariables().get(k).setTypeNumeric();
                dataTable.getDataVariables().get(k).setIntervalDiscrete();
                final String[] booleanFactorValues = {"0", "1"};
                final String[] booleanFactorLabels = {"FALSE", "TRUE"};
                for (int i = 0; i < booleanFactorValues.length; i++) {
                    VariableCategory cat = new VariableCategory();
                    cat.setValue(booleanFactorValues[i]);
                    cat.setLabel(booleanFactorLabels[i]);
                    /* cross-link the variable and category to each other: */
                    cat.setDataVariable(dataTable.getDataVariables().get(k));
                    dataTable.getDataVariables().get(k).getCategories().add(cat);
                }
            }
            // Any other type name leaves the variable as it was.
        } catch (REXPMismatchException ex) {
            // Best effort: a column whose meta-data cannot be converted is
            // skipped. The exception is handed to the logger (rather than
            // printed to stderr) so that the failure and its stack trace end
            // up in the application log.
            // NOTE(review): assumes LOG is a java.util.logging.Logger, as its
            // use of fine(...) suggests - confirm against the field declaration.
            LOG.log(java.util.logging.Level.WARNING,
                    String.format("Could not process variable %d of the data frame.", k), ex);
        }
    }
}
Also used : VariableCategory(edu.harvard.iq.dataverse.datavariable.VariableCategory) RList(org.rosuda.REngine.RList) REXPMismatchException(org.rosuda.REngine.REXPMismatchException)

Aggregations

VariableCategory (edu.harvard.iq.dataverse.datavariable.VariableCategory)11 SummaryStatistic (edu.harvard.iq.dataverse.datavariable.SummaryStatistic)4 VariableRange (edu.harvard.iq.dataverse.datavariable.VariableRange)4 DataVariable (edu.harvard.iq.dataverse.datavariable.DataVariable)3 XMLStreamException (javax.xml.stream.XMLStreamException)2 InvalidData (edu.harvard.iq.dataverse.ingest.tabulardata.InvalidData)1 ArrayList (java.util.ArrayList)1 HashMap (java.util.HashMap)1 LinkedHashMap (java.util.LinkedHashMap)1 Map (java.util.Map)1 REXPMismatchException (org.rosuda.REngine.REXPMismatchException)1 RList (org.rosuda.REngine.RList)1