use of edu.harvard.iq.dataverse.datavariable.DataVariable in project dataverse by IQSS.
the class DDIExportServiceBean method createDataFileDDI.
/**
 * Writes a stand-alone DDI {@code <codeBook>} document describing a single data file.
 * <p>
 * The document contains the study description of the latest version of the
 * file's owner, an optional {@code <fileDscr>} section and a {@code <dataDscr>}
 * section holding one entry per variable of the file's data table. Which
 * sections are produced is governed by the "exclude" and "include" field sets.
 * <p>
 * This method is only called when an /api/meta/file request comes in; for a
 * study export the file and variable sections are created separately, which
 * is why the top-level {@code <codeBook>} header is written here.
 *
 * @param xmlw             stream writer receiving the XML
 * @param excludedFieldSet field names to leave out
 * @param includedFieldSet field names to include
 * @param df               the data file to describe
 * @throws XMLStreamException if writing to {@code xmlw} fails
 */
private void createDataFileDDI(XMLStreamWriter xmlw, Set<String> excludedFieldSet, Set<String> includedFieldSet, DataFile df) throws XMLStreamException {
    // Top-level DDI header.
    xmlw.writeStartElement("codeBook");
    xmlw.writeDefaultNamespace("http://www.icpsr.umich.edu/DDI");
    writeAttribute(xmlw, "version", "2.0");

    // Study-level metadata comes from the latest version of the owning dataset.
    createStdyDscr(xmlw, excludedFieldSet, includedFieldSet, df.getOwner().getLatestVersion());

    // File-level section; note that null (not includedFieldSet) is handed down.
    DataTable dataTable = fileService.findDataTableByFileId(df.getId());
    if (checkField("fileDscr", excludedFieldSet, includedFieldSet)) {
        createFileDscr(xmlw, excludedFieldSet, null, df, dataTable);
    }

    // Variable-level section; the <dataDscr> element is always emitted,
    // its content only when the "var" field is requested.
    xmlw.writeStartElement("dataDscr");
    if (checkField("var", excludedFieldSet, includedFieldSet)) {
        for (DataVariable variable : variableService.findByDataTableId(dataTable.getId())) {
            createVarDDI(xmlw, excludedFieldSet, null, variable);
        }
    }
    xmlw.writeEndElement(); // dataDscr

    xmlw.writeEndElement(); // codeBook
}
use of edu.harvard.iq.dataverse.datavariable.DataVariable in project dataverse by IQSS.
the class DDIExportServiceBean method createDatasetDDI.
/**
 * Writes a complete DDI {@code <codeBook>} document for one dataset version.
 * <p>
 * After the study description, the version's files are split into tabular
 * and non-tabular files. When the "fileDscr" field is requested, a file
 * description is written for every tabular file, followed by a single
 * {@code <dataDscr>} element with the variables of all those files. When the
 * "othrMat" field is requested, every non-tabular file is written as other
 * material. If the version has no file metadata, the document ends right
 * after the study description.
 *
 * @param xmlw             stream writer receiving the XML
 * @param excludedFieldSet field names to leave out
 * @param includedFieldSet field names to include
 * @param version          the dataset version to export
 * @throws XMLStreamException if writing to {@code xmlw} fails
 */
private void createDatasetDDI(XMLStreamWriter xmlw, Set<String> excludedFieldSet, Set<String> includedFieldSet, DatasetVersion version) throws XMLStreamException {
    xmlw.writeStartElement("codeBook");
    xmlw.writeDefaultNamespace("http://www.icpsr.umich.edu/DDI");
    writeAttribute(xmlw, "version", "2.0");
    createStdyDscr(xmlw, excludedFieldSet, includedFieldSet, version);

    List<FileMetadata> fileMetadatas = version.getFileMetadatas();
    if (fileMetadatas == null || fileMetadatas.isEmpty()) {
        // Nothing file-related to describe: close <codeBook> and stop.
        xmlw.writeEndElement();
        return;
    }

    // Split the files: tabular ones get file/variable descriptions,
    // everything else is reported as "other material".
    List<FileMetadata> tabularDataFiles = new ArrayList<>();
    List<FileMetadata> otherDataFiles = new ArrayList<>();
    for (FileMetadata fileMetadata : fileMetadatas) {
        if (fileMetadata.getDataFile().isTabularData()) {
            tabularDataFiles.add(fileMetadata);
        } else {
            otherDataFiles.add(fileMetadata);
        }
    }

    if (checkField("fileDscr", excludedFieldSet, includedFieldSet)) {
        // 1st pass: file descriptions. The data table found for each file is
        // remembered so the 2nd pass does not have to look it up again.
        List<DataTable> dataTables = new ArrayList<>(tabularDataFiles.size());
        for (FileMetadata fileMetadata : tabularDataFiles) {
            DataTable dt = fileService.findDataTableByFileId(fileMetadata.getDataFile().getId());
            createFileDscr(xmlw, excludedFieldSet, includedFieldSet, fileMetadata.getDataFile(), dt);
            dataTables.add(dt);
        }

        // 2nd pass, to create data (variable) description sections.
        // Note that null (not includedFieldSet) is handed to createVarDDI.
        xmlw.writeStartElement("dataDscr");
        for (DataTable dt : dataTables) {
            List<DataVariable> vars = variableService.findByDataTableId(dt.getId());
            for (DataVariable var : vars) {
                createVarDDI(xmlw, excludedFieldSet, null, var);
            }
        }
        xmlw.writeEndElement(); // dataDscr
    }

    if (checkField("othrMat", excludedFieldSet, includedFieldSet)) {
        for (FileMetadata fileMetadata : otherDataFiles) {
            createOtherMat(xmlw, excludedFieldSet, includedFieldSet, fileMetadata);
        }
    }

    xmlw.writeEndElement(); // codeBook
}
use of edu.harvard.iq.dataverse.datavariable.DataVariable in project dataverse by IQSS.
the class CSVFileReader method readFile.
/**
 * Reads a CSV stream in two passes and writes it out as a tab-delimited file.
 * <p>
 * Pass one creates one {@code DataVariable} per header column, copies every
 * record to a temporary file, and inspects every cell to decide whether each
 * column is numeric (integer or continuous), a time, a date, or a plain string.
 * Pass two re-reads the temporary file and prints each record to
 * {@code finalOut} as one tab-joined line, formatted according to the column
 * type chosen in pass one.
 *
 * @param csvReader reader supplying the CSV content, header row first; closed here once the first pass completes
 * @param dataTable table that receives the variable list, the variable count and the case (record) count
 * @param finalOut  writer receiving the tab-delimited output; closed here once the second pass completes
 * @return the record count reported by the second-pass parser, narrowed to {@code int}
 * @throws IOException if a header name is null or empty, if a record is not consistent with the header,
 *                     if the two passes report different record counts, or on an underlying I/O failure
 */
public int readFile(BufferedReader csvReader, DataTable dataTable, PrintWriter finalOut) throws IOException {
List<DataVariable> variableList = new ArrayList<>();
CSVParser parser = new CSVParser(csvReader, inFormat.withHeader());
Map<String, Integer> headers = parser.getHeaderMap();
// Create one variable per header column; i doubles as the column's file order.
int i = 0;
for (String varName : headers.keySet()) {
if (varName == null || varName.isEmpty()) {
// A column without a name is rejected outright.
// -- L.A. 4.0 alpha 1
throw new IOException(BundleUtil.getStringFromBundle("ingest.csv.invalidHeader"));
}
DataVariable dv = new DataVariable();
dv.setName(varName);
dv.setLabel(varName);
dv.setInvalidRanges(new ArrayList<>());
dv.setSummaryStatistics(new ArrayList<>());
dv.setUnf("UNF:6:NOTCALCULATED");
dv.setCategories(new ArrayList<>());
variableList.add(dv);
// Every variable starts out as character/discrete; numeric columns are
// re-typed after the first pass.
dv.setTypeCharacter();
dv.setIntervalDiscrete();
dv.setFileOrder(i);
dv.setDataTable(dataTable);
i++;
}
dataTable.setVarQuantity((long) variableList.size());
dataTable.setDataVariables(variableList);
// Per-column type flags, indexed by column position.
boolean[] isNumericVariable = new boolean[headers.size()];
boolean[] isIntegerVariable = new boolean[headers.size()];
boolean[] isTimeVariable = new boolean[headers.size()];
boolean[] isDateVariable = new boolean[headers.size()];
for (i = 0; i < headers.size(); i++) {
// OK, let's assume that every variable is numeric;
// but we'll go through the file and examine every value; the
// moment we find a value that's not a legit numeric one, we'll
// assume that it is in fact a String.
isNumericVariable[i] = true;
isIntegerVariable[i] = true;
isDateVariable[i] = true;
isTimeVariable[i] = true;
}
// First, "learning" pass.
// (we'll save the incoming stream in another temp file:)
// The format that first matched a column's values is remembered per column.
SimpleDateFormat[] selectedDateTimeFormat = new SimpleDateFormat[headers.size()];
SimpleDateFormat[] selectedDateFormat = new SimpleDateFormat[headers.size()];
// NOTE(review): the temp file is only deleted near the end of this method, outside
// any finally block, so it appears to be left behind when an exception is thrown earlier.
File firstPassTempFile = File.createTempFile("firstpass-", ".csv");
try (CSVPrinter csvFilePrinter = new CSVPrinter(// TODO allow other parsers of tabular data to use this parser by changing inFormat
new FileWriter(firstPassTempFile.getAbsolutePath()), inFormat)) {
// Write headers
csvFilePrinter.printRecord(headers.keySet());
for (CSVRecord record : parser.getRecords()) {
// Checks if #records = #columns in header
if (!record.isConsistent()) {
// Message arguments: line number, expected column count, actual column count.
List<String> args = Arrays.asList(new String[] { "" + (parser.getCurrentLineNumber() - 1), "" + headers.size(), "" + record.size() });
throw new IOException(BundleUtil.getStringFromBundle("ingest.csv.recordMismatch", args));
}
for (i = 0; i < headers.size(); i++) {
String varString = record.get(i);
// Evaluated for every cell, whatever the numeric flag says: the column stays
// "integer" only while each value is empty, exactly "null" (case-sensitive here),
// or one character from firstNumCharSet followed by nothing but digits.
// (firstNumCharSet is defined outside this method - presumably digits and sign characters.)
isIntegerVariable[i] = isIntegerVariable[i] && varString != null && (varString.isEmpty() || varString.equals("null") || (firstNumCharSet.contains(varString.charAt(0)) && StringUtils.isNumeric(varString.substring(1))));
if (isNumericVariable[i]) {
// If variable might be "numeric" test to see if this value is a parsable number:
if (varString != null && !varString.isEmpty()) {
// NOTE(review): isNumeric and isInteger are never read.
boolean isNumeric = false;
boolean isInteger = false;
if (varString.equalsIgnoreCase("NaN") || varString.equalsIgnoreCase("NA") || varString.equalsIgnoreCase("Inf") || varString.equalsIgnoreCase("+Inf") || varString.equalsIgnoreCase("-Inf") || varString.equalsIgnoreCase("null")) {
// Special tokens count as numeric; move on to the next column of this record.
continue;
} else {
try {
// Only the success/failure of the conversion matters; the value itself is unused.
Double testDoubleValue = new Double(varString);
continue;
} catch (NumberFormatException ex) {
// the token failed to parse as a double
// so the column is a string variable.
}
}
// Reached only when the value is neither a special token nor a parsable double.
isNumericVariable[i] = false;
}
}
// For columns that are no longer considered numeric, try to classify the column
// by parsing the cell as a date or date-time value:
if (!isNumericVariable[i]) {
Date dateResult = null;
if (isTimeVariable[i]) {
if (varString != null && !varString.isEmpty()) {
boolean isTime = false;
if (selectedDateTimeFormat[i] != null) {
// A format was already chosen for this column: only that one is tried,
// and it must consume the whole string.
ParsePosition pos = new ParsePosition(0);
dateResult = selectedDateTimeFormat[i].parse(varString, pos);
if (dateResult != null && pos.getIndex() == varString.length()) {
// OK, successfully parsed a value!
isTime = true;
}
} else {
// No format chosen yet: take the first of TIME_FORMATS that consumes the whole string.
for (SimpleDateFormat format : TIME_FORMATS) {
ParsePosition pos = new ParsePosition(0);
dateResult = format.parse(varString, pos);
if (dateResult != null && pos.getIndex() == varString.length()) {
// OK, successfully parsed a value!
isTime = true;
selectedDateTimeFormat[i] = format;
break;
}
}
}
if (!isTime) {
isTimeVariable[i] = false;
// if the token didn't parse as a time value,
// we will still try to parse it as a date, below.
// unless this column is NOT a date.
} else {
// And if it is a time value, we are going to assume it's
// NOT a date.
isDateVariable[i] = false;
}
}
}
if (isDateVariable[i]) {
if (varString != null && !varString.isEmpty()) {
boolean isDate = false;
// -- L.A. 4.0 beta
for (SimpleDateFormat format : DATE_FORMATS) {
// Strict parsing - it will throw an
// exception if it doesn't parse!
// NOTE(review): this mutates the shared DATE_FORMATS entries, and parse(String),
// unlike the ParsePosition checks above, does not verify that the whole
// string was consumed - confirm that trailing text is acceptable here.
format.setLenient(false);
try {
format.parse(varString);
isDate = true;
selectedDateFormat[i] = format;
break;
} catch (ParseException ex) {
// Do nothing
}
}
isDateVariable[i] = isDate;
}
}
}
}
// Copy the record unchanged into the temp file for the second pass.
csvFilePrinter.printRecord(record);
}
}
// The number of records seen in the first pass becomes the table's case count.
dataTable.setCaseQuantity(parser.getRecordNumber());
parser.close();
csvReader.close();
// Re-type the variables that we've determined are numerics:
// (precedence: numeric, then date, then time; anything else stays character/discrete)
for (i = 0; i < headers.size(); i++) {
if (isNumericVariable[i]) {
dataTable.getDataVariables().get(i).setTypeNumeric();
if (isIntegerVariable[i]) {
dataTable.getDataVariables().get(i).setIntervalDiscrete();
} else {
dataTable.getDataVariables().get(i).setIntervalContinuous();
}
} else if (isDateVariable[i] && selectedDateFormat[i] != null) {
// Dates are still Strings, i.e., they are "character" and "discrete";
// But we add special format values for them:
// NOTE(review): the pattern of DATE_FORMATS[0] is stored, not that of
// selectedDateFormat[i] - confirm this is intended if DATE_FORMATS has several entries.
dataTable.getDataVariables().get(i).setFormat(DATE_FORMATS[0].toPattern());
dataTable.getDataVariables().get(i).setFormatCategory("date");
} else if (isTimeVariable[i] && selectedDateTimeFormat[i] != null) {
// Same for time values:
dataTable.getDataVariables().get(i).setFormat(selectedDateTimeFormat[i].toPattern());
dataTable.getDataVariables().get(i).setFormatCategory("time");
}
}
// Second, final pass.
// Re-read the temp file and emit one tab-delimited line per record.
try (BufferedReader secondPassReader = new BufferedReader(new FileReader(firstPassTempFile))) {
parser = new CSVParser(secondPassReader, inFormat.withHeader());
// Reused for every record; each slot is overwritten before the row is printed.
String[] caseRow = new String[headers.size()];
for (CSVRecord record : parser) {
if (!record.isConsistent()) {
List<String> args = Arrays.asList(new String[] { "" + (parser.getCurrentLineNumber() - 1), "" + headers.size(), "" + record.size() });
throw new IOException(BundleUtil.getStringFromBundle("ingest.csv.recordMismatch", args));
}
for (i = 0; i < headers.size(); i++) {
String varString = record.get(i);
if (isNumericVariable[i]) {
if (varString == null || varString.isEmpty() || varString.equalsIgnoreCase("NA")) {
// Missing value - represented as an empty string in
// the final tab file
caseRow[i] = "";
} else if (varString.equalsIgnoreCase("NaN")) {
// "Not a Number" special value:
caseRow[i] = "NaN";
} else if (varString.equalsIgnoreCase("Inf") || varString.equalsIgnoreCase("+Inf")) {
// Positive infinity:
caseRow[i] = "Inf";
} else if (varString.equalsIgnoreCase("-Inf")) {
// Negative infinity:
caseRow[i] = "-Inf";
} else if (varString.equalsIgnoreCase("null")) {
// By request from Gus - "NULL" is recognized as a
// numeric zero:
caseRow[i] = isIntegerVariable[i] ? "0" : "0.0";
} else {
/* No re-formatting is done on any other numeric values.
* We'll save them as they were, for archival purposes.
* The alternative solution - formatting in sci. notation
* is commented-out below.
*/
caseRow[i] = varString;
/*
if (isIntegerVariable[i]) {
try {
Integer testIntegerValue = new Integer(varString);
caseRow[i] = testIntegerValue.toString();
} catch (NumberFormatException ex) {
throw new IOException("Failed to parse a value recognized as an integer in the first pass! (?)");
}
} else {
try {
Double testDoubleValue = new Double(varString);
if (testDoubleValue.equals(0.0)) {
caseRow[i] = "0.0";
} else {
// One possible implementation:
//
// Round our fractional values to 15 digits
// (minimum number of digits of precision guaranteed by
// type Double) and format the resulting representations
// in a IEEE 754-like "scientific notation" - for ex.,
// 753.24 will be encoded as 7.5324e2
BigDecimal testBigDecimal = new BigDecimal(varString, doubleMathContext);
caseRow[i] = String.format(FORMAT_IEEE754, testBigDecimal);
// Strip meaningless zeros and extra + signs:
caseRow[i] = caseRow[i].replaceFirst("00*e", "e");
caseRow[i] = caseRow[i].replaceFirst("\\.e", ".0e");
caseRow[i] = caseRow[i].replaceFirst("e\\+00", "");
caseRow[i] = caseRow[i].replaceFirst("^\\+", "");
}
} catch (NumberFormatException ex) {
throw new IOException("Failed to parse a value recognized as numeric in the first pass! (?)");
}
}
*/
}
} else if (isTimeVariable[i] || isDateVariable[i]) {
// Time and Dates are stored NOT quoted (don't ask).
if (varString != null) {
// Dealing with quotes:
// remove the leading and trailing quotes, if present:
varString = varString.replaceFirst("^\"*", "");
varString = varString.replaceFirst("\"*$", "");
caseRow[i] = varString;
} else {
caseRow[i] = "";
}
} else {
// String column: values are written inside double quotes; a null value is
// written as an empty pair of quotes, i.e. it is treated as an empty string
// and NOT as a missing value.
if (varString != null) {
// escape the quotes, newlines, and tabs:
varString = varString.replace("\"", "\\\"");
varString = varString.replace("\n", "\\n");
varString = varString.replace("\t", "\\t");
// final pair of quotes:
varString = "\"" + varString + "\"";
caseRow[i] = varString;
} else {
caseRow[i] = "\"\"";
}
}
}
finalOut.println(StringUtils.join(caseRow, "\t"));
}
}
// Record count as seen by the second-pass parser.
long linecount = parser.getRecordNumber();
finalOut.close();
parser.close();
dbglog.fine("Tmp File: " + firstPassTempFile);
// Firstpass file is deleted to prevent tmp from filling up.
firstPassTempFile.delete();
// Sanity check: both passes must have seen the same number of records.
if (dataTable.getCaseQuantity().intValue() != linecount) {
List<String> args = Arrays.asList(new String[] { "" + dataTable.getCaseQuantity().intValue(), "" + linecount });
throw new IOException(BundleUtil.getStringFromBundle("ingest.csv.line_mismatch", args));
}
return (int) linecount;
}
use of edu.harvard.iq.dataverse.datavariable.DataVariable in project dataverse by IQSS.
the class RJobRequest method getVariableTypes.
/**
 * getVariableTypes()
 * <p>
 * Maps every variable of the current request to an integer type code:
 * <ul>
 * <li>0 - the format category is "date" or "time" (compared without regard
 *     to case), or the variable is of character type;</li>
 * <li>1 - numeric with a non-null interval that is not continuous;</li>
 * <li>2 - numeric with a continuous or null interval;</li>
 * <li>3 - the format category is exactly "Boolean".</li>
 * </ul>
 * A variable that matches none of these rules contributes no entry, so the
 * returned array can be shorter than the list of requested variables.
 *
 * @return An array of variable types(0, 1, 2, 3)
 * (3 is for Boolean)
 */
public int[] getVariableTypes() {
    List<Integer> rw = new ArrayList<>();
    for (DataVariable dv : dataVariablesForRequest) {
        String formatCategory = dv.getFormatCategory();
        if (!StringUtils.isEmpty(formatCategory)) {
            // equalsIgnoreCase does not depend on the default locale, unlike
            // toLowerCase(), which under e.g. a Turkish locale turns "TIME"
            // into a string that no longer equals "time".
            if ("date".equalsIgnoreCase(formatCategory) || "time".equalsIgnoreCase(formatCategory)) {
                rw.add(0);
                continue;
            }
            if (formatCategory.equals("Boolean")) {
                rw.add(3);
                continue;
            }
        }
        // No (or no special) format category: classify by storage type.
        if (dv.isTypeNumeric()) {
            boolean discrete = dv.getInterval() != null && !dv.isIntervalContinuous();
            rw.add(discrete ? 1 : 2);
        } else if (dv.isTypeCharacter()) {
            rw.add(0);
        }
    }
    Integer[] tmp = rw.toArray(new Integer[rw.size()]);
    dbgLog.fine("vartype=" + StringUtils.join(tmp, ", "));
    // Unbox into the primitive array expected by callers.
    int[] variableTypes = new int[tmp.length];
    for (int j = 0; j < tmp.length; j++) {
        variableTypes[j] = tmp[j];
    }
    return variableTypes;
}
use of edu.harvard.iq.dataverse.datavariable.DataVariable in project dataverse by IQSS.
the class RJobRequest method getVariableNames.
/**
 * Collects the names of the variables of the current request.
 *
 * @return the variable names, in the iteration order of the requested variables
 */
public String[] getVariableNames() {
    List<String> names = new ArrayList<>();
    for (DataVariable variable : dataVariablesForRequest) {
        names.add(variable.getName());
    }
    return names.toArray(new String[0]);
}
Aggregations