use of edu.harvard.iq.dataverse.datavariable.DataVariable in project dataverse by IQSS.
the class PORFileReader method read.
/**
 * Reads an SPSS portable (POR) file and populates the ingest result.
 *
 * <p>The stream is handed to {@code decodeHeader}, and the file it returns is
 * then read as US-ASCII text, one section at a time, until the data section
 * (header id "F") has been decoded. That file is deleted before this method
 * returns, whether parsing succeeded or not. Afterwards one
 * {@code DataVariable} is created per entry of {@code variableTypelList},
 * typed as character or numeric, and attached to {@code dataTable}.
 *
 * @param stream the POR file content
 * @param additionalData optional file from which extended variable labels are
 *        loaded via {@code createLabelMap}; may be {@code null}
 * @return the {@code ingesteddata} object with its data table set
 * @throws IOException if the file ends before the data section was reached,
 *         if a "Z" section header is encountered, or if decoding fails
 */
@Override
public TabularDataIngest read(BufferedInputStream stream, File additionalData) throws IOException {
    dbgLog.fine("PORFileReader: read() start");
    if (additionalData != null) {
        dbgLog.fine("Using extended variable labels from file " + additionalData.getName());
        extendedLabels = createLabelMap(additionalData);
    }
    File tempPORfile = decodeHeader(stream);
    BufferedReader bfReader = null;
    try {
        bfReader = new BufferedReader(new InputStreamReader(new FileInputStream(tempPORfile.getAbsolutePath()), "US-ASCII"));
        decodeSec2(bfReader);
        while (true) {
            // Section header id (LENGTH_SECTION_HEADER characters; only the first is used).
            char[] header = new char[LENGTH_SECTION_HEADER];
            if (bfReader.read(header) < 0) {
                // Without this check a truncated file would keep producing a
                // '\0' header id instead of failing.
                throw new IOException("reading failure: unexpected end of file while reading a section header");
            }
            String headerId = Character.toString(header[0]);
            dbgLog.fine("////////////////////// headerId=" + headerId + "//////////////////////");
            if (headerId.equals("Z")) {
                throw new IOException("reading failure: wrong headerId(Z) here");
            }
            if (headerId.equals("F")) {
                // Missing values must be processed before the data section is decoded.
                if ((missingValueTable != null) && (missingValueTable.size() > 0)) {
                    processMissingValueData();
                }
            }
            if (headerId.equals("8") && isCurrentVariableString) {
                headerId = "8S";
            }
            decode(headerId, bfReader);
            // "F" is the last section we decode; we stop here
            // without reaching the end of this file.
            if (headerId.equals("F")) {
                break;
            }
        }
    } finally {
        if (bfReader != null) {
            try {
                bfReader.close();
            } catch (IOException ex) {
                // Best effort: a failed close must not mask the parsing outcome.
                dbgLog.log(java.util.logging.Level.WARNING, "PORFileReader: failed to close the reader of the decoded POR file", ex);
            }
        }
        if (tempPORfile.exists() && !tempPORfile.delete()) {
            dbgLog.warning("PORFileReader: could not delete temporary file " + tempPORfile.getAbsolutePath());
        }
    }
    dbgLog.fine("done parsing headers and decoding;");

    List<DataVariable> variableList = new ArrayList<>();
    for (int indx = 0; indx < variableTypelList.size(); indx++) {
        DataVariable dv = new DataVariable();
        String varName = variableNameList.get(indx);
        dv.setName(varName);
        String varLabel = variableLabelMap.get(varName);
        if (varLabel != null && varLabel.length() > 255) {
            varLabel = varLabel.substring(0, 255);
        }
        // An extended label, when supplied, takes precedence over the label
        // found in the file (and is not truncated).
        // -- L.A. 4.0, beta11
        if (extendedLabels != null && extendedLabels.get(varName) != null) {
            dv.setLabel(extendedLabels.get(varName));
        } else {
            dv.setLabel(varLabel);
        }
        dv.setInvalidRanges(new ArrayList<>());
        dv.setSummaryStatistics(new ArrayList<>());
        dv.setUnf("UNF:6:");
        dv.setCategories(new ArrayList<>());
        dv.setFileOrder(indx);
        dv.setDataTable(dataTable);
        variableList.add(dv);

        int simpleType = 0;
        if (variableTypelList.get(indx) != null) {
            simpleType = variableTypelList.get(indx);
        }
        if (simpleType <= 0) {
            // We need to make one last type adjustment:
            // Dates and Times will be stored as character values in the
            // dataverse tab files; even though they are not typed as
            // strings at this point:
            // TODO:
            // Make sure the date/time format is properly preserved!
            // (see the setFormatCategory below... but double-check!)
            // -- L.A. 4.0 alpha
            String variableFormatType = variableFormatTypeList[indx];
            if (variableFormatType != null) {
                if (variableFormatType.equals("time") || variableFormatType.equals("date")) {
                    simpleType = 1;
                    String formatCategory = formatCategoryTable.get(varName);
                    if (formatCategory != null && dateFormatList[indx] != null) {
                        dbgLog.fine("setting format category to " + formatCategory);
                        dv.setFormatCategory(formatCategory);
                        dbgLog.fine("setting formatschemaname to " + dateFormatList[indx]);
                        dv.setFormat(dateFormatList[indx]);
                    }
                } else if (variableFormatType.equals("other")) {
                    dbgLog.fine("Variable of format type \"other\"; type adjustment may be needed");
                    dbgLog.fine("SPSS print format: " + printFormatTable.get(dv.getName()));
                    // Constant-first equals: a variable without a print format
                    // entry simply stays numeric instead of raising an NPE.
                    if ("WKDAY".equals(printFormatTable.get(dv.getName())) || "MONTH".equals(printFormatTable.get(dv.getName()))) {
                        // week day or month;
                        // These are not treated as time/date values (meaning, we
                        // don't define time/date formats for them; there's likely
                        // no valid ISO time/date format for just a month or a day
                        // of week). However, the
                        // values will be stored in the TAB files as strings,
                        // and not as numerics - as they were stored in the
                        // SAV file. So we need to adjust the type here.
                        // -- L.A.
                        simpleType = 1;
                    }
                }
            }
        }
        dbgLog.fine("Finished creating variable " + indx + ", " + varName);
        if (simpleType > 0) {
            // String:
            dv.setTypeCharacter();
            dv.setIntervalDiscrete();
        } else {
            // Numeric:
            dv.setTypeNumeric();
            if (decimalVariableSet.contains(indx)) {
                dv.setIntervalContinuous();
            } else {
                dv.setIntervalDiscrete();
            }
        }
        dbgLog.fine("Finished configuring variable type information.");
    }
    dbgLog.fine("done configuring variables;");
    /*
     * From the original (3.6) code:
        //smd.setVariableTypeMinimal(ArrayUtils.toPrimitive(variableTypelList.toArray(new Integer[variableTypelList.size()])));
        smd.setVariableFormat(printFormatList);
        smd.setVariableFormatName(printFormatNameTable);
        smd.setVariableFormatCategory(formatCategoryTable);
        smd.setValueLabelMappingTable(valueVariableMappingTable);
     * TODO:
     * double-check that it's all being taken care of by the new plugin!
     * (for variable format and formatName, consult the SAV plugin)
     */
    dataTable.setDataVariables(variableList);
    // Assign value labels:
    assignValueLabels(valueLabelTable);
    ingesteddata.setDataTable(dataTable);
    dbgLog.info("PORFileReader: read() end");
    return ingesteddata;
}
use of edu.harvard.iq.dataverse.datavariable.DataVariable in project dataverse by IQSS.
the class RDATAFileReader method getDataFrameInformation.
/**
 * Runs an R script that extracts metadata from the *original* Rdata
 * object, then parses its output and creates the DataVariable objects
 * of {@code dataTable}.
 *
 * <p>This method does not throw: any failure while evaluating the script or
 * reading its result is logged as a warning, and {@code dataTable} is left
 * in whatever state had been reached at that point.
 */
private void getDataFrameInformation() {
    LOG.fine("RDATAFileReader: Entering `getDataFrameInformation` function");
    String parentDirectory = mRWorkspace.getRdataFile().getParent();
    // NOTE(review): both paths are spliced into the R script unescaped; a path
    // containing a double quote or a backslash would break the script -
    // confirm that workspace paths can never contain such characters.
    String fileInfoScript = new StringBuilder()
            .append(String.format("load(\"%s\")\n", mRWorkspace.getRdataAbsolutePath()))
            .append(String.format("setwd(\"%s\")\n", parentDirectory))
            .append(RSCRIPT_GET_DATASET)
            .append("\n")
            .append(RSCRIPT_DATASET_INFO_SCRIPT)
            .toString();
    try {
        RRequest request = mRequestBuilder.build();
        request.script(fileInfoScript);
        RList fileInformation = request.eval().asList();
        RList metaInfo = fileInformation.at("meta.info").asList();
        String[] variableNames = fileInformation.at("varNames").asStrings();

        // Initialize variables; the position in varNames becomes the file order.
        int varQnty = 0;
        List<DataVariable> variableList = new ArrayList<>();
        for (String varName : variableNames) {
            DataVariable dv = new DataVariable();
            dv.setName(varName);
            // The name doubles as the label for now.
            // TODO:
            // Check if variables have real descriptive labels defined,
            // via the mechanism provided by that special optional package...
            // (?) -- L.A.
            dv.setLabel(varName);
            dv.setInvalidRanges(new ArrayList<>());
            dv.setSummaryStatistics(new ArrayList<>());
            dv.setUnf("UNF:6:XYZXYZXYZ");
            dv.setCategories(new ArrayList<>());
            variableList.add(dv);
            dv.setFileOrder(varQnty);
            dv.setDataTable(dataTable);
            varQnty++;
        }
        dataTable.setVarQuantity(Long.valueOf(varQnty));
        dataTable.setDataVariables(variableList);

        // Get the Variable Meta Data Table while Populating
        processVariableInfo(metaInfo, dataTable);

        if (fileInformation.at("caseQnty") != null) {
            int caseQuantity = 0;
            try {
                caseQuantity = fileInformation.at("caseQnty").asInteger();
            } catch (REXPMismatchException ignored) {
                // Not fatal: the case quantity simply stays unset.
                LOG.fine("RDATAFileReader: caseQnty could not be read as an integer");
            }
            if (caseQuantity > 0) {
                dataTable.setCaseQuantity(Long.valueOf(caseQuantity));
            }
        }
    } catch (REXPMismatchException ex) {
        LOG.log(java.util.logging.Level.WARNING, "RDATAFileReader: Could not put information correctly", ex);
    } catch (Exception ex) {
        // Logged through the logger (with the stack trace) rather than printed
        // to stderr; ex.getMessage() alone may be null.
        LOG.log(java.util.logging.Level.WARNING, "RDATAFileReader: failed to extract the data frame information", ex);
    }
}
use of edu.harvard.iq.dataverse.datavariable.DataVariable in project dataverse by IQSS.
the class RTabFileParser method read.
/**
 * Parses the delimited text file produced by the R ingest and writes every
 * case to {@code pwout} as one tab-delimited line.
 *
 * <p>Each column is converted according to the type of the corresponding
 * variable in {@code dataTable}:
 * <ul>
 * <li>character: enclosing quotes are removed, inner quotes are escaped and
 *     the value is re-quoted (date/time values are left unquoted); an empty
 *     token becomes an empty (missing) value;</li>
 * <li>numeric, continuous: {@code NA} becomes an empty value; {@code NaN},
 *     {@code Inf} and {@code -Inf} are passed through; anything else is
 *     parsed as a double;</li>
 * <li>numeric, discrete with format category "Boolean": {@code TRUE}/{@code FALSE}
 *     become {@code 1}/{@code 0};</li>
 * <li>any other numeric: {@code NA} becomes an empty value, anything else is
 *     parsed as an integer.</li>
 * </ul>
 * Numeric values that fail to parse are written as empty (missing) values.
 *
 * <p>NOTE(review): an older comment on this method described it as the
 * in-memory variant and marked it for removal; this implementation writes to
 * {@code pwout} - confirm whether that TODO is stale.
 * NOTE(review): lines are split on the delimiter without regard to quoting,
 * so a quoted string that contains the delimiter is reported as a
 * column-count mismatch.
 *
 * @param csvReader reader over the delimited file
 * @param dataTable metadata supplying the variable count and variable types
 * @param pwout destination of the tab-delimited output; closed on success
 * @return the number of lines (cases) read
 * @throws IOException if the variable metadata is missing, a line has the
 *         wrong number of values, a Boolean column holds an unexpected value,
 *         or writing to {@code pwout} failed
 */
public int read(BufferedReader csvReader, DataTable dataTable, PrintWriter pwout) throws IOException {
    dbgLog.warning("RTabFileParser: Inside R Tab file parser");
    int varQnty = 0;
    try {
        varQnty = dataTable.getVarQuantity().intValue();
    } catch (Exception ex) {
        throw new IOException("R Tab File Parser: Could not obtain varQnty from the dataset metadata.", ex);
    }
    if (varQnty == 0) {
        throw new IOException("R Tab File Parser: varQnty=0 in the dataset metadata!");
    }
    dbgLog.fine("CSV reader; varQnty: " + varQnty);
    dbgLog.fine("CSV reader; delimiter: " + delimiterChar);

    String[] caseRow = new String[varQnty];
    String line;
    String[] valueTokens;
    int lineCounter = 0;
    boolean[] isCharacterVariable = new boolean[varQnty];
    boolean[] isContinuousVariable = new boolean[varQnty];
    boolean[] isTimeVariable = new boolean[varQnty];
    boolean[] isBooleanVariable = new boolean[varQnty];

    // Without variable metadata every column would silently be parsed as an
    // integer, turning all other values into missing ones.
    if (dataTable.getDataVariables() == null) {
        throw new IOException("R Tab File Parser: no variables defined in the dataset metadata.");
    }
    for (int i = 0; i < varQnty; i++) {
        DataVariable variable = dataTable.getDataVariables().get(i);
        if (variable == null) {
            throw new IOException("R Tab File Parser: no metadata for variable " + i + ".");
        }
        if (variable.isTypeCharacter()) {
            isCharacterVariable[i] = true;
            isContinuousVariable[i] = false;
            if (variable.getFormatCategory() != null && (variable.getFormatCategory().startsWith("date") || variable.getFormatCategory().startsWith("time"))) {
                isTimeVariable[i] = true;
            }
        } else if (variable.isTypeNumeric()) {
            isCharacterVariable[i] = false;
            if (variable.isIntervalContinuous()) {
                isContinuousVariable[i] = true;
            } else {
                // discrete by default:
                isContinuousVariable[i] = false;
                if (variable.getFormatCategory() != null && variable.getFormatCategory().equals("Boolean")) {
                    isBooleanVariable[i] = true;
                }
            }
        }
        // A variable that is neither character nor numeric falls through and
        // is handled as a numeric integer column below.
    }

    // The delimiter is a literal, not a regular expression: quote it so that
    // characters such as '|' or '.' split correctly, and compile it only once.
    final java.util.regex.Pattern delimiterPattern = java.util.regex.Pattern.compile(java.util.regex.Pattern.quote("" + delimiterChar));

    while ((line = csvReader.readLine()) != null) {
        // chop the line:
        line = line.replaceFirst("[\r\n]*$", "");
        // Negative limit: trailing empty values are kept.
        valueTokens = delimiterPattern.split(line, -2);
        if (valueTokens.length != varQnty) {
            throw new IOException("Reading mismatch, line " + (lineCounter + 1) + " of the Data file: " + varQnty + " delimited values expected, " + valueTokens.length + " found.");
        }
        for (int i = 0; i < varQnty; i++) {
            if (isCharacterVariable[i]) {
                // Empty strings stored as " " (one white space):
                if (valueTokens[i] != null && (!valueTokens[i].equals(""))) {
                    // Dealing with quotes:
                    // remove the leading and trailing quotes, if present,
                    // and escape the remaining ones:
                    String charToken = stripEnclosingQuotes(valueTokens[i]).replace("\"", "\\\"");
                    // final pair of quotes (date/time values stay unquoted):
                    if (!isTimeVariable[i]) {
                        charToken = "\"" + charToken + "\"";
                    }
                    caseRow[i] = charToken;
                } else {
                    // missing value:
                    caseRow[i] = "";
                }
            } else if (isContinuousVariable[i]) {
                // Numeric, Double:
                // This is the major case of special/custom processing,
                // specific for R ingest. It was found to be impossible
                // to write a numeric/continuous column into the tab file
                // while unambiguously preserving both NA and NaNs, if both
                // are present. At least, not if using the standard
                // write.table function. So it seemed easier to treat this
                // as a special case, rather than write our own write.table
                // equivalent in R. On the R side, if any special values
                // are present in the columns, the values will be
                // converted into a character vector. The NAs and NaNs will
                // be replaced with the character tokens "NA" and "NaN"
                // respectively. Of course R will add double quotes around
                // the tokens, hence the post-processing - we'll just need
                // to remove all these quotes, and then we'll be fine.
                dbgLog.fine("R Tab File Parser; double value: " + valueTokens[i]);
                String doubleToken = stripEnclosingQuotes(valueTokens[i]);
                if (doubleToken.equalsIgnoreCase("NA")) {
                    caseRow[i] = "";
                } else if (doubleToken.equalsIgnoreCase("NaN")) {
                    caseRow[i] = "NaN";
                } else if (doubleToken.equalsIgnoreCase("Inf") || doubleToken.equalsIgnoreCase("+Inf")) {
                    caseRow[i] = "Inf";
                } else if (doubleToken.equalsIgnoreCase("-Inf")) {
                    caseRow[i] = "-Inf";
                } else {
                    try {
                        caseRow[i] = Double.valueOf(doubleToken).toString();
                    } catch (NumberFormatException ex) {
                        dbgLog.fine("caught exception reading numeric value; variable: " + i + ", case: " + lineCounter + "; value: " + doubleToken);
                        caseRow[i] = "";
                        // TODO:
                        // decide if we should rather throw an exception and exit here;
                        // all the values in this file at this point must be
                        // legit numeric values (?) -- L.A.
                    }
                }
            } else if (isBooleanVariable[i]) {
                if (valueTokens[i] != null) {
                    // remove the leading and trailing quotes, if present:
                    String charToken = stripEnclosingQuotes(valueTokens[i]);
                    if (charToken.equals("FALSE")) {
                        caseRow[i] = "0";
                    } else if (charToken.equals("TRUE")) {
                        caseRow[i] = "1";
                    } else if (charToken.equals("")) {
                        // Legit case - Missing Value!
                        caseRow[i] = charToken;
                    } else {
                        throw new IOException("Unexpected value for the Boolean variable (" + i + "): " + charToken);
                    }
                } else {
                    throw new IOException("Couldn't read Boolean variable (" + i + ")!");
                }
            } else {
                // Numeric, Integer:
                // One special case first: R NA (missing value) needs to be
                // converted into the DVN's missing value - an empty String;
                // (strictly speaking, this isn't necessary - an attempt to
                // create an Integer object from the String "NA" would
                // result in an exception, that would be intercepted below,
                // with the same end result)
                dbgLog.fine("R Tab File Parser; integer value: " + valueTokens[i]);
                if (valueTokens[i] != null && valueTokens[i].equalsIgnoreCase("NA")) {
                    caseRow[i] = "";
                } else {
                    try {
                        caseRow[i] = Integer.valueOf(valueTokens[i]).toString();
                    } catch (NumberFormatException ex) {
                        dbgLog.fine("caught exception reading numeric value; variable: " + i + ", case: " + lineCounter + "; value: " + valueTokens[i]);
                        caseRow[i] = "";
                    }
                }
            }
        }
        pwout.println(StringUtils.join(caseRow, "\t"));
        lineCounter++;
    }
    // PrintWriter never throws on write errors; ask it explicitly so that a
    // truncated tab file is not reported as a successful ingest.
    boolean writeFailed = pwout.checkError();
    pwout.close();
    if (writeFailed) {
        throw new IOException("R Tab File Parser: failed to write the tab-delimited output.");
    }
    return lineCounter;
}

/** Removes one leading and one trailing double quote from the token, if present. */
private static String stripEnclosingQuotes(String token) {
    return token.replaceFirst("^\"", "").replaceFirst("\"$", "");
}
use of edu.harvard.iq.dataverse.datavariable.DataVariable in project dataverse by IQSS.
the class SAVFileReader method decodeRecordType2.
/**
 * Decodes the Record Type 2 (RT2) units of an SPSS SAV file and creates the
 * {@code DataVariable} list of {@code dataTable}.
 *
 * <p>One RT2 unit is read for each of the {@code OBSUnitsPerCase} OBS (8-byte)
 * units of a case. Every unit consists of a fixed-size segment, optionally
 * followed by a variable label and by missing value units. Units that are
 * continuation chunks of a long string variable are recorded in
 * {@code OBSwiseTypelList} but do not produce a variable of their own.
 *
 * <p>Every segment is read in full; a stream that ends in the middle of a
 * segment is reported as an {@code IOException} rather than being decoded
 * from a partly filled buffer.
 *
 * @param stream the SAV file content, positioned at the first RT2 unit
 * @throws IllegalArgumentException if {@code stream} is {@code null}
 * @throws IOException if the stream ends prematurely, a unit is malformed,
 *         or fewer than {@code OBSUnitsPerCase} RT2 units are found
 */
void decodeRecordType2(BufferedInputStream stream) throws IOException {
    dbgLog.fine("decodeRecordType2(): start");
    if (stream == null) {
        throw new IllegalArgumentException("stream == null!");
    }
    // printFormatNameTable, printFormatList and the case weight locals are
    // filled in below but not consumed yet (see the TODOs at the end).
    Map<String, String> printFormatNameTable = new LinkedHashMap<String, String>();
    Map<String, String> variableLabelMap = new LinkedHashMap<String, String>();
    Map<String, List<String>> missingValueTable = new LinkedHashMap<String, List<String>>();
    List<Integer> printFormatList = new ArrayList<Integer>();
    String caseWeightVariableName = null;
    int caseWeightVariableIndex = 0;
    boolean lastVariableIsExtendable = false;
    boolean extendedVariableMode = false;
    boolean obs255 = false;
    String lastVariableName = null;
    String lastExtendedVariable = null;
    // this field repeats as many as the number of variables in
    // this sav file
    // (note that the above statement is not technically correct, this
    // record repeats not just for every variable in the file, but for
    // every OBS (8 byte unit); i.e., if a string is split into multiple
    // OBS units, each one will have its own RT2 record -- L.A.).
    // Each field consists of a fixed (32-byte) segment and
    // then a few variable segments:
    // if the variable has a label (3rd INT4 set to 1), then there's 4 more
    // bytes specifying the length of the label, and then that many bytes
    // holding the label itself (no more than 256).
    // Then if there are optional missing value units (4th INT4 set to 1)
    // there will be 3 more OBS units attached = 24 extra bytes.
    int variableCounter = 0;
    int obsSeqNumber = 0;
    // Declared outside the loop: its final value tells whether the loop
    // was left early.
    int j;
    dbgLog.fine("RT2: Reading " + OBSUnitsPerCase + " OBS units.");
    for (j = 0; j < OBSUnitsPerCase; j++) {
        dbgLog.fine("RT2: " + j + "-th RT2 unit is being decoded.");
        // 2.0: read the fixed[=non-optional] 32-byte segment
        byte[] recordType2Fixed = new byte[LENGTH_RECORDTYPE2_FIXED];
        try {
            readRecordType2Bytes(stream, recordType2Fixed, "reading recordType2: no bytes read!");
            int offset = 0;
            // 2.1: create int-view of the bytebuffer for the first 16-byte segment
            int rt2_1st_4_units = 4;
            int[] recordType2FixedPart1 = new int[rt2_1st_4_units];
            for (int i = 0; i < rt2_1st_4_units; i++) {
                ByteBuffer intView = ByteBuffer.wrap(recordType2Fixed, offset, LENGTH_SAV_INT_BLOCK);
                offset += LENGTH_SAV_INT_BLOCK;
                if (isLittleEndian) {
                    intView.order(ByteOrder.LITTLE_ENDIAN);
                }
                recordType2FixedPart1[i] = intView.getInt();
            }
            // 1st ([0]) element must be 2 otherwise no longer Record Type 2
            if (recordType2FixedPart1[0] != 2) {
                dbgLog.warning(j + "-th RT header value is no longet RT2! " + recordType2FixedPart1[0]);
                break;
            }
            dbgLog.fine("variable type[must be 2]=" + recordType2FixedPart1[0]);
            // 2.3 variable name: 8 byte(space[x20]-padded)
            // This field is located at the very end of the 32 byte
            // fixed-size RT2 header (bytes 24-31).
            // We are processing it now, so that
            // we can make the decision on whether this variable is part
            // of a compound variable:
            String RawVariableName = getNullStrippedString(new String(Arrays.copyOfRange(recordType2Fixed, 24, (24 + LENGTH_VARIABLE_NAME)), defaultCharSet));
            String variableName = null;
            if (RawVariableName.indexOf(' ') >= 0) {
                variableName = RawVariableName.substring(0, RawVariableName.indexOf(' '));
            } else {
                variableName = RawVariableName;
            }
            // 2nd ([1]) element: numeric variable = 0 :for string variable
            // this block indicates its datum-length, i.e, >0 ;
            // if -1, this RT2 unit is a non-1st RT2 unit for a string variable
            // whose value is longer than 8 character.
            boolean isNumericVariable = false;
            dbgLog.fine("variable type(0: numeric; > 0: String;-1 continue )=" + recordType2FixedPart1[1]);
            int HowManyRt2Units = 1;
            if (recordType2FixedPart1[1] == -1) {
                dbgLog.fine("this RT2 is an 8 bit continuation chunk of an earlier string variable");
                if (obs255) {
                    if (obsSeqNumber < 30) {
                        OBSwiseTypelList.add(recordType2FixedPart1[1]);
                        obsSeqNumber++;
                    } else {
                        OBSwiseTypelList.add(-2);
                        obs255 = false;
                        obsSeqNumber = 0;
                    }
                } else {
                    OBSwiseTypelList.add(recordType2FixedPart1[1]);
                }
                obsNonVariableBlockSet.add(j);
                continue;
            } else if (recordType2FixedPart1[1] == 0) {
                // This is a numeric variable
                extendedVariableMode = false;
                // And as such, it cannot be an extension of a
                // previous, long string variable.
                OBSwiseTypelList.add(recordType2FixedPart1[1]);
                variableCounter++;
                isNumericVariable = true;
                variableTypelList.add(recordType2FixedPart1[1]);
            } else if (recordType2FixedPart1[1] > 0) {
                if (recordType2FixedPart1[1] == 255) {
                    obs255 = true;
                }
                if (lastVariableIsExtendable) {
                    String varNameBase = null;
                    if (lastVariableName.length() > 5) {
                        varNameBase = lastVariableName.substring(0, 5);
                    } else {
                        varNameBase = lastVariableName;
                    }
                    if (extendedVariableMode) {
                        if (variableNameIsAnIncrement(varNameBase, lastExtendedVariable, variableName)) {
                            OBSwiseTypelList.add(-1);
                            lastExtendedVariable = variableName;
                            // OK, we stay in the "extended variable" mode;
                            // but we can't move on to the next OBS yet:
                            // see the next comment below for the explanation.
                            //
                            // Should we also set "extendable" flag to false at this point
                            // if it's shorter than 255 bytes, i.e. the last extended chunk?
                        } else {
                            extendedVariableMode = false;
                        }
                    } else {
                        if (variableNameIsAnIncrement(varNameBase, variableName)) {
                            OBSwiseTypelList.add(-1);
                            extendedVariableMode = true;
                            dbgLog.fine("RT2: in extended variable mode; variable " + variableName);
                            lastExtendedVariable = variableName;
                            // Before we move on to the next OBS unit, we need to check
                            // if this current extended variable has its own label specified;
                            // If so, we need to determine its length, then read and skip
                            // that many bytes. Hence no "continue" here.
                        }
                    }
                }
                if (!extendedVariableMode) {
                    // OK, this is a "real"
                    // string variable, and not a continuation chunk of a compound
                    // string.
                    OBSwiseTypelList.add(recordType2FixedPart1[1]);
                    variableCounter++;
                    // A variable that is 255 bytes long is
                    // either the single "atomic" variable of the
                    // max allowed size, or it's a 255 byte segment
                    // of a compound variable. So we will check
                    // the next variable and see if it is the continuation
                    // of this one.
                    lastVariableIsExtendable = (recordType2FixedPart1[1] == 255);
                    if (recordType2FixedPart1[1] % LENGTH_SAV_OBS_BLOCK == 0) {
                        HowManyRt2Units = recordType2FixedPart1[1] / LENGTH_SAV_OBS_BLOCK;
                    } else {
                        HowManyRt2Units = recordType2FixedPart1[1] / LENGTH_SAV_OBS_BLOCK + 1;
                    }
                    variableTypelList.add(recordType2FixedPart1[1]);
                }
            }
            if (!extendedVariableMode) {
                // Again, we only want to do the following steps for the "real"
                // variables, not the chunks of split mega-variables:
                dbgLog.fine("RT2: HowManyRt2Units for this variable=" + HowManyRt2Units);
                lastVariableName = variableName;
                // caseWeightVariableOBSIndex starts from 1: 0 is used for does-not-exist cases
                if (j == (caseWeightVariableOBSIndex - 1)) {
                    caseWeightVariableName = variableName;
                    // TODO: do we need this "index"? -- 4.0 alpha
                    caseWeightVariableIndex = variableCounter;
                    // /smd.setCaseWeightVariableName(caseWeightVariableName);
                    // /smd.getFileInformation().put("caseWeightVariableIndex", caseWeightVariableIndex);
                }
                OBSIndexToVariableName.put(j, variableName);
                dbgLog.fine("RT2: " + j + "-th variable name=" + variableName + "<-");
                dbgLog.fine("RT2: raw variable: " + RawVariableName);
                variableNameList.add(variableName);
            }
            // 3rd ([2]) element: = 1 variable-label block follows; 0 = no label
            //
            dbgLog.fine("RT: variable label follows?(1:yes; 0: no)=" + recordType2FixedPart1[2]);
            boolean hasVariableLabel = (recordType2FixedPart1[2] == 1);
            if ((recordType2FixedPart1[2] != 0) && (recordType2FixedPart1[2] != 1)) {
                throw new IOException("RT2: reading error: value is neither 0 or 1" + recordType2FixedPart1[2]);
            }
            if (hasVariableLabel) {
                // 2.4 [optional] length of the variable label
                byte[] length_variable_label = new byte[4];
                readRecordType2Bytes(stream, length_variable_label, "RT 2: error reading recordType2.4: no bytes read!");
                dbgLog.fine("nbytes_2_4=" + length_variable_label.length);
                ByteBuffer bb_length_variable_label = ByteBuffer.wrap(length_variable_label, 0, LENGTH_VARIABLE_LABEL);
                if (isLittleEndian) {
                    bb_length_variable_label.order(ByteOrder.LITTLE_ENDIAN);
                }
                int rawVariableLabelLength = bb_length_variable_label.getInt();
                dbgLog.fine("rawVariableLabelLength=" + rawVariableLabelLength);
                int variableLabelLength = getSAVintAdjustedBlockLength(rawVariableLabelLength);
                dbgLog.fine("RT2: variableLabelLength=" + variableLabelLength);
                // 2.5 [optional]variable label whose length is found at 2.4
                String variableLabel = "";
                if (rawVariableLabelLength > 0) {
                    byte[] variable_label = new byte[variableLabelLength];
                    readRecordType2Bytes(stream, variable_label, "RT 2: error reading recordType2.5: " + variableLabelLength + " bytes requested, no bytes read!");
                    dbgLog.fine("nbytes_2_5=" + variable_label.length);
                    variableLabel = getNullStrippedString(new String(Arrays.copyOfRange(variable_label, 0, rawVariableLabelLength), defaultCharSet));
                    dbgLog.fine("RT2: variableLabel=" + variableLabel + "<-");
                    dbgLog.fine(variableName + " => " + variableLabel);
                } else {
                    dbgLog.fine("RT2: defaulting to empty variable label.");
                }
                if (!extendedVariableMode) {
                    // We only have any use for this label if it's a "real" variable.
                    // Thinking about it, it doesn't make much sense for the "fake"
                    // variables that are actually chunks of large strings to store
                    // their own labels. But in some files they do. Then failing to read
                    // the bytes would result in getting out of sync with the RT record
                    // borders. So we always read the bytes, but only use them for
                    // the real variable entries.
                    variableLabelMap.put(variableName, variableLabel);
                }
            }
            if (extendedVariableMode) {
                // Nothing else to decode for a chunk of an extended variable;
                // that's how SPSS stores them.
                continue;
            }
            // 4th ([3]) element: Missing value type code
            // 0[none], 1, 2, 3 [point-type],-2[range], -3 [range type+ point]
            dbgLog.fine("RT: missing value unit follows?(if 0, none)=" + recordType2FixedPart1[3]);
            boolean hasMissingValues = validMissingValueCodeSet.contains(recordType2FixedPart1[3]) && (recordType2FixedPart1[3] != 0);
            InvalidData invalidDataInfo = null;
            if (recordType2FixedPart1[3] != 0) {
                invalidDataInfo = new InvalidData(recordType2FixedPart1[3]);
                dbgLog.fine("RT: missing value type=" + invalidDataInfo.getType());
            }
            // 2.2: print/write formats: 4-byte each = 8 bytes
            byte[] printFormt = Arrays.copyOfRange(recordType2Fixed, offset, offset + LENGTH_PRINT_FORMAT_CODE);
            dbgLog.fine("printFrmt=" + new String(Hex.encodeHex(printFormt)));
            offset += LENGTH_PRINT_FORMAT_CODE;
            int formatCode = isLittleEndian ? printFormt[2] : printFormt[1];
            int formatWidth = isLittleEndian ? printFormt[1] : printFormt[2];
            // TODO:
            // What should we be doing with these "format decimal positions"
            // in 4.0?
            // -- L.A. 4.0 alpha
            int formatDecimalPointPosition = isLittleEndian ? printFormt[0] : printFormt[3];
            dbgLog.fine("RT2: format code{5=F, 1=A[String]}=" + formatCode);
            formatDecimalPointPositionList.add(formatDecimalPointPosition);
            if (!SPSSConstants.FORMAT_CODE_TABLE_SAV.containsKey(formatCode)) {
                throw new IOException("Unknown format code was found = " + formatCode);
            } else {
                printFormatList.add(formatCode);
            }
            byte[] writeFormt = Arrays.copyOfRange(recordType2Fixed, offset, offset + LENGTH_WRITE_FORMAT_CODE);
            dbgLog.fine("RT2: writeFrmt=" + new String(Hex.encodeHex(writeFormt)));
            if (writeFormt[3] != 0x00) {
                dbgLog.fine("byte-order(write format): reversal required");
            }
            offset += LENGTH_WRITE_FORMAT_CODE;
            if (!SPSSConstants.ORDINARY_FORMAT_CODE_SET.contains(formatCode)) {
                StringBuilder sb = new StringBuilder(SPSSConstants.FORMAT_CODE_TABLE_SAV.get(formatCode) + formatWidth);
                if (formatDecimalPointPosition > 0) {
                    sb.append("." + formatDecimalPointPosition);
                }
                dbgLog.fine("formattable[i] = " + variableName + " -> " + sb.toString());
                printFormatNameTable.put(variableName, sb.toString());
            }
            printFormatTable.put(variableName, SPSSConstants.FORMAT_CODE_TABLE_SAV.get(formatCode));
            if (hasMissingValues) {
                dbgLog.fine("RT2: decoding missing value: type=" + recordType2FixedPart1[3]);
                int howManyMissingValueUnits = missingValueCodeUnits.get(recordType2FixedPart1[3]);
                dbgLog.fine("RT2: howManyMissingValueUnits=" + howManyMissingValueUnits);
                // 2.6 [optional] missing value units, one OBS block each
                byte[] missing_value_code_units = new byte[LENGTH_SAV_OBS_BLOCK * howManyMissingValueUnits];
                readRecordType2Bytes(stream, missing_value_code_units, "RT 2: reading recordType2.6: no byte was read");
                dbgLog.fine("nbytes_2_6=" + missing_value_code_units.length);
                if (isNumericVariable) {
                    List<String> mv = new ArrayList<String>();
                    int offset_start = 0;
                    for (int i = 0; i < howManyMissingValueUnits; i++) {
                        ByteBuffer missingValueView = ByteBuffer.wrap(missing_value_code_units, offset_start, LENGTH_SAV_OBS_BLOCK);
                        offset_start += LENGTH_SAV_OBS_BLOCK;
                        if (isLittleEndian) {
                            missingValueView.order(ByteOrder.LITTLE_ENDIAN);
                        }
                        double missingValue = missingValueView.getDouble();
                        // The two extreme values are kept in their hex form;
                        // ordinary values are formatted as decimal numbers.
                        if (Double.toHexString(missingValue).equals("-0x1.ffffffffffffep1023")) {
                            dbgLog.fine("1st value is LOWEST");
                            mv.add(Double.toHexString(missingValue));
                        } else if (Double.valueOf(missingValue).equals(Double.MAX_VALUE)) {
                            dbgLog.fine("2nd value is HIGHEST");
                            mv.add(Double.toHexString(missingValue));
                        } else {
                            mv.add(doubleNumberFormatter.format(missingValue));
                        }
                        dbgLog.fine(i + "-th missing value=" + Double.toHexString(missingValue));
                    }
                    dbgLog.fine("variableName=" + variableName);
                    if (recordType2FixedPart1[3] > 0) {
                        // point cases only
                        dbgLog.fine("mv(>0)=" + mv);
                        missingValueTable.put(variableName, mv);
                        invalidDataInfo.setInvalidValues(mv);
                    } else if (recordType2FixedPart1[3] == -2) {
                        dbgLog.fine("mv(-2)=" + mv);
                        // range
                        invalidDataInfo.setInvalidRange(mv);
                    } else if (recordType2FixedPart1[3] == -3) {
                        // mixed case: a range (first two values) plus one point value
                        dbgLog.fine("mv(-3)=" + mv);
                        invalidDataInfo.setInvalidRange(mv.subList(0, 2));
                        invalidDataInfo.setInvalidValues(mv.subList(2, 3));
                        missingValueTable.put(variableName, mv.subList(2, 3));
                    }
                    dbgLog.fine("missing value=" + StringUtils.join(missingValueTable.get(variableName), "|"));
                    dbgLog.fine("invalidDataInfo(Numeric):\n" + invalidDataInfo);
                    invalidDataTable.put(variableName, invalidDataInfo);
                } else {
                    // string variable case
                    List<String> mv = new ArrayList<String>();
                    int offset_start = 0;
                    int offset_end = LENGTH_SAV_OBS_BLOCK;
                    for (int i = 0; i < howManyMissingValueUnits; i++) {
                        String missingValue = StringUtils.stripEnd(new String(Arrays.copyOfRange(missing_value_code_units, offset_start, offset_end), defaultCharSet), " ");
                        dbgLog.fine("missing value=" + missingValue + "<-");
                        offset_start = offset_end;
                        offset_end += LENGTH_SAV_OBS_BLOCK;
                        mv.add(missingValue);
                    }
                    invalidDataInfo.setInvalidValues(mv);
                    missingValueTable.put(variableName, mv);
                    invalidDataTable.put(variableName, invalidDataInfo);
                    dbgLog.fine("missing value(str)=" + StringUtils.join(missingValueTable.get(variableName), "|"));
                    dbgLog.fine("invalidDataInfo(String):\n" + invalidDataInfo);
                }
                dbgLog.fine("invalidDataTable:\n" + invalidDataTable);
            }
        } catch (IOException ex) {
            throw ex;
        } catch (Exception ex) {
            // Kept as best effort (decoding continues with the next unit), but
            // reported through the logger instead of stderr.
            // NOTE(review): should we be throwing some exception here?
            dbgLog.log(java.util.logging.Level.WARNING, "RT2: unexpected error while decoding the " + j + "-th RT2 unit", ex);
        }
    }
    if (j != OBSUnitsPerCase) {
        dbgLog.fine("RT2: attention! didn't reach the end of the OBS list!");
        throw new IOException("RT2: didn't reach the end of the OBS list!");
    }
    dbgLog.fine("RT2 metadata-related exit-chores");
    dataTable.setVarQuantity(Long.valueOf(variableCounter));
    dbgLog.fine("RT2: varQnty=" + variableCounter);
    // 4.0 Initialize variables:
    List<DataVariable> variableList = new ArrayList<DataVariable>();
    for (int i = 0; i < variableCounter; i++) {
        DataVariable dv = new DataVariable();
        String varName = variableNameList.get(i);
        dbgLog.fine("name: " + varName);
        dv.setName(varName);
        String varLabel = variableLabelMap.get(varName);
        if (varLabel != null && varLabel.length() > 255) {
            // TODO:
            // variable labels will be changed into type 'TEXT' in the
            // database - this will eliminate the 255 char. limit.
            // -- L.A. 4.0 beta11
            dbgLog.fine("Have to truncate label: " + varLabel);
            varLabel = varLabel.substring(0, 255);
        }
        dbgLog.fine("label: " + varLabel);
        dv.setLabel(varLabel);
        dv.setInvalidRanges(new ArrayList<VariableRange>());
        dv.setSummaryStatistics(new ArrayList<SummaryStatistic>());
        dv.setUnf("UNF:6:");
        dv.setCategories(new ArrayList<VariableCategory>());
        variableList.add(dv);
        dv.setFileOrder(i);
        dv.setDataTable(dataTable);
    }
    dataTable.setDataVariables(variableList);
    // /smd.setVariableName(variableNameList.toArray(new String[variableNameList.size()]));
    // /smd.setVariableLabel(variableLabelMap);
    // TODO:
    // figure out what to do with the missing value table!
    // -- 4.0 alpha
    // well, they were used to generate merged summary statistics for
    // the variable. So need to verify what the DDI import was doing
    // with them and replicate the same in 4.0.
    // (add appropriate value labels?)
    // /TODO: 4.0 smd.setMissingValueTable(missingValueTable);
    // /smd.getFileInformation().put("caseWeightVariableName", caseWeightVariableName);
    dbgLog.fine("sumstat:long case=" + Arrays.deepToString(variableTypelList.toArray()));
    dbgLog.fine("RT2: OBSwiseTypelList=" + OBSwiseTypelList);
    dbgLog.fine("decodeRecordType2(): end");
}

/**
 * Fills {@code buffer} completely from {@code stream}.
 *
 * <p>{@code InputStream.read} reports the end of the stream with -1, never
 * with 0, and may return fewer bytes than requested; so the result has to be
 * accumulated rather than compared against 0.
 *
 * @param stream the stream to read from
 * @param buffer the buffer to fill
 * @param nothingReadMessage exception message used when not a single byte
 *        could be read
 * @throws IOException if the stream ends before {@code buffer} is full
 */
private static void readRecordType2Bytes(BufferedInputStream stream, byte[] buffer, String nothingReadMessage) throws IOException {
    int total = 0;
    do {
        int count = stream.read(buffer, total, buffer.length - total);
        if (count <= 0) {
            if (total == 0) {
                throw new IOException(nothingReadMessage);
            }
            throw new IOException("RT2: premature end of stream: " + total + " of " + buffer.length + " bytes read");
        }
        total += count;
    } while (total < buffer.length);
}
use of edu.harvard.iq.dataverse.datavariable.DataVariable in project dataverse by IQSS.
the class RJobRequest method getVariableLabels.
/**
 * Collects the label of every data variable of this request.
 *
 * @return the variable labels as a String array, in the iteration order of
 *         the request's data variables
 */
public String[] getVariableLabels() {
    final List<String> labels = new ArrayList<>();
    for (final DataVariable variable : dataVariablesForRequest) {
        labels.add(variable.getLabel());
    }
    return labels.toArray(new String[0]);
}
Aggregations