Search in sources :

Example 16 with DataVariable

use of edu.harvard.iq.dataverse.datavariable.DataVariable in project dataverse by IQSS.

the class DDIExportServiceBean method createDataFileDDI.

/**
 * Writes a complete, standalone DDI {@code <codeBook>} document for one data file.
 *
 * <p>The output contains the study description, the {@code <fileDscr>} section and the
 * {@code <dataDscr>} section with one entry per variable of the file's data table;
 * or a subset of those, as selected by the "include" and "exclude" field sets.
 *
 * <p>This method is only used for /api/meta/file requests. For a full study export the
 * file and variable sections are produced separately, which is why the top-level
 * {@code <codeBook>} header has to be written here.
 *
 * @param xmlw             the stream writer receiving the DDI XML
 * @param excludedFieldSet names of fields to leave out
 * @param includedFieldSet names of fields to include
 * @param df               the data file being described
 * @throws XMLStreamException if writing to {@code xmlw} fails
 */
private void createDataFileDDI(XMLStreamWriter xmlw, Set<String> excludedFieldSet, Set<String> includedFieldSet, DataFile df) throws XMLStreamException {
    // Top-level element, namespace and DDI version.
    xmlw.writeStartElement("codeBook");
    xmlw.writeDefaultNamespace("http://www.icpsr.umich.edu/DDI");
    writeAttribute(xmlw, "version", "2.0");

    // Study-level section, taken from the latest version of the owning dataset.
    createStdyDscr(xmlw, excludedFieldSet, includedFieldSet, df.getOwner().getLatestVersion());

    final DataTable dataTable = fileService.findDataTableByFileId(df.getId());

    // File-level section. The nested call gets null in place of the include set
    // (presumably so that sub-fields of an included section are not filtered
    // again - confirm against createFileDscr).
    final boolean wantsFileDscr = checkField("fileDscr", excludedFieldSet, includedFieldSet);
    if (wantsFileDscr) {
        createFileDscr(xmlw, excludedFieldSet, null, df, dataTable);
    }

    // Variable-level section; the <dataDscr> wrapper is emitted even when the
    // variables themselves are filtered out.
    xmlw.writeStartElement("dataDscr");
    final boolean wantsVariables = checkField("var", excludedFieldSet, includedFieldSet);
    if (wantsVariables) {
        for (DataVariable variable : variableService.findByDataTableId(dataTable.getId())) {
            createVarDDI(xmlw, excludedFieldSet, null, variable);
        }
    }
    xmlw.writeEndElement(); // dataDscr

    xmlw.writeEndElement(); // codeBook
}
Also used : DataTable(edu.harvard.iq.dataverse.DataTable) DataVariable(edu.harvard.iq.dataverse.datavariable.DataVariable)

Example 17 with DataVariable

use of edu.harvard.iq.dataverse.datavariable.DataVariable in project dataverse by IQSS.

the class DDIExportServiceBean method createDatasetDDI.

/**
 * Writes a complete DDI {@code <codeBook>} document for a dataset version.
 *
 * <p>The study description is always written. If the version has files, they are split
 * into tabular and non-tabular ones: tabular files produce file description entries
 * followed by a single {@code <dataDscr>} section holding all their variables (both
 * gated by the "fileDscr" field check), and non-tabular files are written via
 * {@code createOtherMat} (gated by the "othrMat" field check).
 *
 * @param xmlw             the stream writer receiving the DDI XML
 * @param excludedFieldSet names of fields to leave out
 * @param includedFieldSet names of fields to include
 * @param version          the dataset version being exported
 * @throws XMLStreamException if writing to {@code xmlw} fails
 */
private void createDatasetDDI(XMLStreamWriter xmlw, Set<String> excludedFieldSet, Set<String> includedFieldSet, DatasetVersion version) throws XMLStreamException {
    xmlw.writeStartElement("codeBook");
    xmlw.writeDefaultNamespace("http://www.icpsr.umich.edu/DDI");
    writeAttribute(xmlw, "version", "2.0");
    createStdyDscr(xmlw, excludedFieldSet, includedFieldSet, version);

    final List<FileMetadata> allFiles = version.getFileMetadatas();
    final boolean hasFiles = allFiles != null && !allFiles.isEmpty();

    if (hasFiles) {
        // Partition the files: tabular ones get file/variable descriptions,
        // everything else is listed as "other material".
        final List<FileMetadata> tabular = new ArrayList<>();
        final List<FileMetadata> nonTabular = new ArrayList<>();
        for (FileMetadata candidate : allFiles) {
            (candidate.getDataFile().isTabularData() ? tabular : nonTabular).add(candidate);
        }

        if (checkField("fileDscr", excludedFieldSet, includedFieldSet)) {
            // 1st pass: one file description per tabular file.
            for (FileMetadata tabularFile : tabular) {
                DataTable table = fileService.findDataTableByFileId(tabularFile.getDataFile().getId());
                createFileDscr(xmlw, excludedFieldSet, includedFieldSet, tabularFile.getDataFile(), table);
            }

            // 2nd pass: a single <dataDscr> with the variables of every tabular file.
            xmlw.writeStartElement("dataDscr");
            for (FileMetadata tabularFile : tabular) {
                DataTable table = fileService.findDataTableByFileId(tabularFile.getDataFile().getId());
                for (DataVariable variable : variableService.findByDataTableId(table.getId())) {
                    createVarDDI(xmlw, excludedFieldSet, null, variable);
                }
            }
            xmlw.writeEndElement(); // dataDscr
        }

        if (checkField("othrMat", excludedFieldSet, includedFieldSet)) {
            for (FileMetadata otherFile : nonTabular) {
                createOtherMat(xmlw, excludedFieldSet, includedFieldSet, otherFile);
            }
        }
    }

    xmlw.writeEndElement(); // codeBook
}
Also used : DataTable(edu.harvard.iq.dataverse.DataTable) FileMetadata(edu.harvard.iq.dataverse.FileMetadata) ArrayList(java.util.ArrayList) DataVariable(edu.harvard.iq.dataverse.datavariable.DataVariable)

Example 18 with DataVariable

use of edu.harvard.iq.dataverse.datavariable.DataVariable in project dataverse by IQSS.

the class CSVFileReader method readFile.

/**
 * Ingests a CSV stream in two passes and writes it out as a tab-delimited file.
 *
 * <p>A {@link DataVariable} is created for every header column and attached to
 * {@code dataTable}. The first ("learning") pass copies the input into a temporary
 * CSV file while deciding, per column, whether all of its values are numeric
 * (and integer), time values, or date values. The variables are then re-typed
 * accordingly, and the second pass re-reads the temporary file and prints one
 * tab-separated line per record to {@code finalOut}.
 *
 * <p>On success {@code csvReader} and {@code finalOut} are closed by this method.
 * The temporary file is removed on every exit path, including failures.
 *
 * @param csvReader reader positioned at the start of the CSV input (header row first)
 * @param dataTable table that receives the variables, variable count and case count
 * @param finalOut  writer that receives the tab-delimited output
 * @return the number of records read in the second pass
 * @throws IOException if a header name is null or empty, a record's size does not
 *                     match the header, the two passes disagree on the number of
 *                     records, or an underlying read/write fails
 */
public int readFile(BufferedReader csvReader, DataTable dataTable, PrintWriter finalOut) throws IOException {
    List<DataVariable> variableList = new ArrayList<>();
    CSVParser parser = new CSVParser(csvReader, inFormat.withHeader());
    Map<String, Integer> headers = parser.getHeaderMap();
    int i = 0;
    for (String varName : headers.keySet()) {
        if (varName == null || varName.isEmpty()) {
            // A column without a usable name cannot become a variable.
            throw new IOException(BundleUtil.getStringFromBundle("ingest.csv.invalidHeader"));
        }
        // Every variable starts out as "character"/"discrete"; numeric columns
        // are re-typed after the first pass.
        DataVariable dv = new DataVariable();
        dv.setName(varName);
        dv.setLabel(varName);
        dv.setInvalidRanges(new ArrayList<>());
        dv.setSummaryStatistics(new ArrayList<>());
        dv.setUnf("UNF:6:NOTCALCULATED");
        dv.setCategories(new ArrayList<>());
        variableList.add(dv);
        dv.setTypeCharacter();
        dv.setIntervalDiscrete();
        dv.setFileOrder(i);
        dv.setDataTable(dataTable);
        i++;
    }
    dataTable.setVarQuantity((long) variableList.size());
    dataTable.setDataVariables(variableList);
    boolean[] isNumericVariable = new boolean[headers.size()];
    boolean[] isIntegerVariable = new boolean[headers.size()];
    boolean[] isTimeVariable = new boolean[headers.size()];
    boolean[] isDateVariable = new boolean[headers.size()];
    for (i = 0; i < headers.size(); i++) {
        // OK, let's assume that every variable is numeric;
        // but we'll go through the file and examine every value; the
        // moment we find a value that's not a legit numeric one, we'll
        // assume that it is in fact a String.
        isNumericVariable[i] = true;
        isIntegerVariable[i] = true;
        isDateVariable[i] = true;
        isTimeVariable[i] = true;
    }
    // First, "learning" pass.
    // (we'll save the incoming stream in another temp file:)
    SimpleDateFormat[] selectedDateTimeFormat = new SimpleDateFormat[headers.size()];
    SimpleDateFormat[] selectedDateFormat = new SimpleDateFormat[headers.size()];
    File firstPassTempFile = File.createTempFile("firstpass-", ".csv");
    try {
        try (CSVPrinter csvFilePrinter = new CSVPrinter(// TODO allow other parsers of tabular data to use this parser by changing inFormat
        new FileWriter(firstPassTempFile.getAbsolutePath()), inFormat)) {
            // Write  headers
            csvFilePrinter.printRecord(headers.keySet());
            // NOTE(review): getRecords() reads every record before the loop body runs,
            // so getCurrentLineNumber() below probably reflects the end of the input
            // rather than the offending record - confirm against Commons CSV.
            for (CSVRecord record : parser.getRecords()) {
                // Checks if #records = #columns in header
                if (!record.isConsistent()) {
                    List<String> args = Arrays.asList(new String[] { "" + (parser.getCurrentLineNumber() - 1), "" + headers.size(), "" + record.size() });
                    throw new IOException(BundleUtil.getStringFromBundle("ingest.csv.recordMismatch", args));
                }
                for (i = 0; i < headers.size(); i++) {
                    String varString = record.get(i);
                    // A column stays "integer" only while every value is empty, the
                    // literal "null", or one character from firstNumCharSet followed
                    // by digits only.
                    isIntegerVariable[i] = isIntegerVariable[i] && varString != null && (varString.isEmpty() || varString.equals("null") || (firstNumCharSet.contains(varString.charAt(0)) && StringUtils.isNumeric(varString.substring(1))));
                    if (isNumericVariable[i]) {
                        // If variable might be "numeric" test to see if this value is a parsable number:
                        if (varString != null && !varString.isEmpty()) {
                            if (varString.equalsIgnoreCase("NaN") || varString.equalsIgnoreCase("NA") || varString.equalsIgnoreCase("Inf") || varString.equalsIgnoreCase("+Inf") || varString.equalsIgnoreCase("-Inf") || varString.equalsIgnoreCase("null")) {
                                // Recognized special token; the column is still numeric.
                                continue;
                            } else {
                                try {
                                    // Only the side effect matters: this throws if the
                                    // token is not a parsable double.
                                    Double.parseDouble(varString);
                                    continue;
                                } catch (NumberFormatException ex) {
                                    // the token failed to parse as a double
                                    // so the column is a string variable.
                                }
                            }
                            isNumericVariable[i] = false;
                        }
                    }
                    // The column is not numeric: see whether it could still be a time or
                    // date column, by parsing the cell as a date or date-time value:
                    if (!isNumericVariable[i]) {
                        Date dateResult = null;
                        if (isTimeVariable[i]) {
                            if (varString != null && !varString.isEmpty()) {
                                boolean isTime = false;
                                if (selectedDateTimeFormat[i] != null) {
                                    // A format was already chosen for this column;
                                    // every later value must match that same format.
                                    ParsePosition pos = new ParsePosition(0);
                                    dateResult = selectedDateTimeFormat[i].parse(varString, pos);
                                    if (dateResult != null && pos.getIndex() == varString.length()) {
                                        // OK, successfully parsed a value!
                                        isTime = true;
                                    }
                                } else {
                                    // No format chosen yet: take the first one that
                                    // consumes the entire token.
                                    for (SimpleDateFormat format : TIME_FORMATS) {
                                        ParsePosition pos = new ParsePosition(0);
                                        dateResult = format.parse(varString, pos);
                                        if (dateResult != null && pos.getIndex() == varString.length()) {
                                            // OK, successfully parsed a value!
                                            isTime = true;
                                            selectedDateTimeFormat[i] = format;
                                            break;
                                        }
                                    }
                                }
                                if (!isTime) {
                                    isTimeVariable[i] = false;
                                    // if the token didn't parse as a time value,
                                    // we will still try to parse it as a date, below.
                                    // unless this column is NOT a date.
                                } else {
                                    // And if it is a time value, we are going to assume it's
                                    // NOT a date.
                                    isDateVariable[i] = false;
                                }
                            }
                        }
                        if (isDateVariable[i]) {
                            if (varString != null && !varString.isEmpty()) {
                                boolean isDate = false;
                                for (SimpleDateFormat format : DATE_FORMATS) {
                                    // Strict parsing - it will throw an
                                    // exception if it doesn't parse!
                                    format.setLenient(false);
                                    try {
                                        format.parse(varString);
                                        isDate = true;
                                        selectedDateFormat[i] = format;
                                        break;
                                    } catch (ParseException ex) {
                                        // Not this format; try the next one.
                                    }
                                }
                                isDateVariable[i] = isDate;
                            }
                        }
                    }
                }
                csvFilePrinter.printRecord(record);
            }
        }
        dataTable.setCaseQuantity(parser.getRecordNumber());
        parser.close();
        csvReader.close();
        // Re-type the variables that we've determined are numerics:
        for (i = 0; i < headers.size(); i++) {
            if (isNumericVariable[i]) {
                dataTable.getDataVariables().get(i).setTypeNumeric();
                if (isIntegerVariable[i]) {
                    dataTable.getDataVariables().get(i).setIntervalDiscrete();
                } else {
                    dataTable.getDataVariables().get(i).setIntervalContinuous();
                }
            } else if (isDateVariable[i] && selectedDateFormat[i] != null) {
                // Dates are still Strings, i.e., they are "character" and "discrete";
                // But we add special format values for them:
                // NOTE(review): this records DATE_FORMATS[0] rather than the format that
                // actually matched (selectedDateFormat[i]); the two only agree if
                // DATE_FORMATS has a single entry - confirm.
                dataTable.getDataVariables().get(i).setFormat(DATE_FORMATS[0].toPattern());
                dataTable.getDataVariables().get(i).setFormatCategory("date");
            } else if (isTimeVariable[i] && selectedDateTimeFormat[i] != null) {
                // Same for time values:
                dataTable.getDataVariables().get(i).setFormat(selectedDateTimeFormat[i].toPattern());
                dataTable.getDataVariables().get(i).setFormatCategory("time");
            }
        }
        // Second, final pass.
        try (BufferedReader secondPassReader = new BufferedReader(new FileReader(firstPassTempFile))) {
            parser = new CSVParser(secondPassReader, inFormat.withHeader());
            String[] caseRow = new String[headers.size()];
            for (CSVRecord record : parser) {
                if (!record.isConsistent()) {
                    List<String> args = Arrays.asList(new String[] { "" + (parser.getCurrentLineNumber() - 1), "" + headers.size(), "" + record.size() });
                    throw new IOException(BundleUtil.getStringFromBundle("ingest.csv.recordMismatch", args));
                }
                for (i = 0; i < headers.size(); i++) {
                    String varString = record.get(i);
                    if (isNumericVariable[i]) {
                        if (varString == null || varString.isEmpty() || varString.equalsIgnoreCase("NA")) {
                            // Missing value - represented as an empty string in
                            // the final tab file
                            caseRow[i] = "";
                        } else if (varString.equalsIgnoreCase("NaN")) {
                            // "Not a Number" special value:
                            caseRow[i] = "NaN";
                        } else if (varString.equalsIgnoreCase("Inf") || varString.equalsIgnoreCase("+Inf")) {
                            // Positive infinity:
                            caseRow[i] = "Inf";
                        } else if (varString.equalsIgnoreCase("-Inf")) {
                            // Negative infinity:
                            caseRow[i] = "-Inf";
                        } else if (varString.equalsIgnoreCase("null")) {
                            // By request from Gus - "NULL" is recognized as a
                            // numeric zero:
                            caseRow[i] = isIntegerVariable[i] ? "0" : "0.0";
                        } else {
                            // No re-formatting is done on any other numeric values.
                            // We save them as they were, for archival purposes.
                            caseRow[i] = varString;
                        }
                    } else if (isTimeVariable[i] || isDateVariable[i]) {
                        // Time and Dates are stored NOT quoted (don't ask).
                        if (varString != null) {
                            // Dealing with quotes:
                            // remove the leading and trailing quotes, if present:
                            varString = varString.replaceFirst("^\"*", "");
                            varString = varString.replaceFirst("\"*$", "");
                            caseRow[i] = varString;
                        } else {
                            caseRow[i] = "";
                        }
                    } else {
                        // String variable: the value is always stored quoted, so an
                        // empty cell becomes "" - an empty string, NOT a missing value.
                        if (varString != null) {
                            // escape the quotes, newlines, and tabs:
                            varString = varString.replace("\"", "\\\"");
                            varString = varString.replace("\n", "\\n");
                            varString = varString.replace("\t", "\\t");
                            // final pair of quotes:
                            varString = "\"" + varString + "\"";
                            caseRow[i] = varString;
                        } else {
                            caseRow[i] = "\"\"";
                        }
                    }
                }
                finalOut.println(StringUtils.join(caseRow, "\t"));
            }
        }
        long linecount = parser.getRecordNumber();
        finalOut.close();
        parser.close();
        dbglog.fine("Tmp File: " + firstPassTempFile);
        // Both passes must have seen the same number of records.
        if (dataTable.getCaseQuantity().intValue() != linecount) {
            List<String> args = Arrays.asList(new String[] { "" + dataTable.getCaseQuantity().intValue(), "" + linecount });
            throw new IOException(BundleUtil.getStringFromBundle("ingest.csv.line_mismatch", args));
        }
        return (int) linecount;
    } finally {
        // Firstpass file is deleted to prevent tmp from filling up; doing it in
        // finally makes sure failed ingests do not leave the file behind either.
        if (!firstPassTempFile.delete()) {
            dbglog.warning("Failed to delete temporary file: " + firstPassTempFile);
        }
    }
}
Also used : FileWriter(java.io.FileWriter) ArrayList(java.util.ArrayList) DataVariable(edu.harvard.iq.dataverse.datavariable.DataVariable) CSVPrinter(org.apache.commons.csv.CSVPrinter) TabularDataFileReader(edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataFileReader) FileReader(java.io.FileReader) ParsePosition(java.text.ParsePosition) IOException(java.io.IOException) Date(java.util.Date) CSVParser(org.apache.commons.csv.CSVParser) BufferedReader(java.io.BufferedReader) CSVRecord(org.apache.commons.csv.CSVRecord) ParseException(java.text.ParseException) SimpleDateFormat(java.text.SimpleDateFormat) File(java.io.File)

Example 19 with DataVariable

use of edu.harvard.iq.dataverse.datavariable.DataVariable in project dataverse by IQSS.

the class RJobRequest method getVariableTypes.

/**
 * Computes a numeric type code for each variable in the request.
 *
 * <p>The codes are:
 * <ul>
 *   <li>0 - format category "date" or "time" (compared case-insensitively),
 *       or a character variable;</li>
 *   <li>1 - numeric variable with an interval that is not continuous;</li>
 *   <li>2 - numeric variable that is continuous or has no interval set;</li>
 *   <li>3 - format category exactly "Boolean".</li>
 * </ul>
 *
 * <p>A variable that matches none of the above contributes no entry, so the
 * returned array can be shorter than the list of requested variables.
 *
 * @return    An array of variable types(0, 1, 2, 3)
 */
public int[] getVariableTypes() {
    List<Integer> rw = new ArrayList<>();
    for (DataVariable dv : dataVariablesForRequest) {
        String formatCategory = dv.getFormatCategory();
        if (!StringUtils.isEmpty(formatCategory)) {
            // Locale.ROOT keeps the comparison independent of the JVM's default
            // locale (e.g. under a Turkish locale "TIME" would not lower-case
            // to "time").
            String normalizedCategory = formatCategory.toLowerCase(java.util.Locale.ROOT);
            if (normalizedCategory.equals("date") || normalizedCategory.equals("time")) {
                rw.add(0);
                continue;
            }
            if (formatCategory.equals("Boolean")) {
                rw.add(3);
                continue;
            }
        }
        // No special format category: classify by storage type and interval.
        if (dv.isTypeNumeric()) {
            boolean discrete = dv.getInterval() != null && !dv.isIntervalContinuous();
            rw.add(discrete ? 1 : 2);
        } else if (dv.isTypeCharacter()) {
            rw.add(0);
        }
    }
    Integer[] tmp = rw.toArray(new Integer[rw.size()]);
    dbgLog.fine("vartype=" + StringUtils.join(tmp, ", "));
    int[] variableTypes = new int[tmp.length];
    for (int j = 0; j < tmp.length; j++) {
        variableTypes[j] = tmp[j];
    }
    return variableTypes;
}
Also used : DataVariable(edu.harvard.iq.dataverse.datavariable.DataVariable)

Example 20 with DataVariable

use of edu.harvard.iq.dataverse.datavariable.DataVariable in project dataverse by IQSS.

the class RJobRequest method getVariableNames.

/**
 * Collects the names of the variables in this request.
 *
 * @return the result of {@code getName()} for every entry of
 *         {@code dataVariablesForRequest}, in iteration order
 */
public String[] getVariableNames() {
    final List<String> names = new ArrayList<>();
    for (DataVariable variable : dataVariablesForRequest) {
        names.add(variable.getName());
    }
    return names.toArray(new String[0]);
}
Also used : DataVariable(edu.harvard.iq.dataverse.datavariable.DataVariable)

Aggregations

DataVariable (edu.harvard.iq.dataverse.datavariable.DataVariable)25 DataFile (edu.harvard.iq.dataverse.DataFile)8 IOException (java.io.IOException)6 ArrayList (java.util.ArrayList)5 Dataset (edu.harvard.iq.dataverse.Dataset)4 Dataverse (edu.harvard.iq.dataverse.Dataverse)4 FileInputStream (java.io.FileInputStream)4 FileMetadata (edu.harvard.iq.dataverse.FileMetadata)3 VariableCategory (edu.harvard.iq.dataverse.datavariable.VariableCategory)3 File (java.io.File)3 FileNotFoundException (java.io.FileNotFoundException)3 InputStream (java.io.InputStream)3 DataTable (edu.harvard.iq.dataverse.DataTable)2 SummaryStatistic (edu.harvard.iq.dataverse.datavariable.SummaryStatistic)2 VariableRange (edu.harvard.iq.dataverse.datavariable.VariableRange)2 NoSuchAlgorithmException (java.security.NoSuchAlgorithmException)2 SimpleDateFormat (java.text.SimpleDateFormat)2 Date (java.util.Date)2 HashMap (java.util.HashMap)2 LinkedHashMap (java.util.LinkedHashMap)2