use of edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataIngest in project dataverse by IQSS.
the class CSVFileReader method read.
/**
 * Reads a CSV file, converts it into a dataverse DataTable.
 *
 * <p>The converted content is written to a newly created temporary file
 * ("data-*.tab"). That file and the <code>DataTable</code> passed to
 * <code>readFile</code> are returned wrapped in a
 * <code>TabularDataIngest</code>. The UNF of the table is set to the
 * placeholder value "UNF:6:NOTCALCULATED".
 *
 * @param stream a <code>BufferedInputStream</code>; must not be null.
 * @param dataFile not referenced by this implementation.
 * @return an <code>TabularDataIngest</code> object
 * @throws java.io.IOException if the stream is null or a reading error occurs.
 */
@Override
public TabularDataIngest read(BufferedInputStream stream, File dataFile) throws IOException {
    init();

    if (stream == null) {
        throw new IOException(BundleUtil.getStringFromBundle("ingest.csv.nullStream"));
    }

    TabularDataIngest ingesteddata = new TabularDataIngest();
    DataTable dataTable = new DataTable();

    BufferedReader localBufferedReader = new BufferedReader(new InputStreamReader(stream));
    File tabFileDestination = File.createTempFile("data-", ".tab");

    // try-with-resources guarantees the writer is flushed and closed even when
    // readFile() fails part-way through; closing a PrintWriter twice is
    // harmless, so this is safe even if readFile() closes it as well.
    try (PrintWriter tabFileWriter = new PrintWriter(tabFileDestination.getAbsolutePath())) {
        readFile(localBufferedReader, dataTable, tabFileWriter);
    }

    dbglog.fine("Tab file produced: " + tabFileDestination.getAbsolutePath());

    // The real UNF is not computed here; only a placeholder is stored.
    dataTable.setUnf("UNF:6:NOTCALCULATED");

    ingesteddata.setTabDelimitedFile(tabFileDestination);
    ingesteddata.setDataTable(dataTable);
    return ingesteddata;
}
use of edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataIngest in project dataverse by IQSS.
the class XLSXFileReader method read.
/**
 * Reads an XLSX file, converts it into a dataverse DataTable.
 *
 * <p>The conversion is done in two passes. In the first pass
 * <code>processSheet</code> is handed the input stream, the
 * <code>DataTable</code> and a writer on a temporary "firstpass-*.tab" file.
 * In the second pass that temporary file is re-read line by line and the
 * final "data-*.tab" file is produced: values of numeric variables are
 * normalized (missing values become empty strings, NaN/Inf spellings are
 * unified, "null" becomes 0, everything else is re-printed as a double) and
 * values of all other variables are wrapped in double quotes. The first-pass
 * file is deleted before this method returns or throws.
 *
 * @param stream a <code>BufferedInputStream</code>.
 * @param dataFile not referenced by this implementation.
 * @return an <code>TabularDataIngest</code> object
 * @throws java.io.IOException if a reading error occurs, the spreadsheet
 *         cannot be parsed, no data rows are found, or the two passes
 *         disagree about the number of values or lines.
 */
@Override
public TabularDataIngest read(BufferedInputStream stream, File dataFile) throws IOException {
    init();

    TabularDataIngest ingesteddata = new TabularDataIngest();
    DataTable dataTable = new DataTable();

    File firstPassTempFile = File.createTempFile("firstpass-", ".tab");
    File tabFileDestination;
    int lineCounter = 0;

    try {
        // 1st pass:
        PrintWriter firstPassWriter = new PrintWriter(firstPassTempFile.getAbsolutePath());
        try {
            processSheet(stream, dataTable, firstPassWriter);
        } catch (Exception ex) {
            // Keep the original exception as the cause so the stack trace is not lost.
            throw new IOException("Could not parse Excel/XLSX spreadsheet. " + ex.getMessage(), ex);
        } finally {
            // Make sure everything is flushed to disk before the second pass
            // re-reads the file; closing an already closed writer is a no-op.
            firstPassWriter.close();
        }

        if (dataTable.getCaseQuantity() == null || dataTable.getCaseQuantity().intValue() < 1) {
            String errorMessage;
            if (dataTable.getVarQuantity() == null || dataTable.getVarQuantity().intValue() < 1) {
                errorMessage = "No rows of data found in the Excel (XLSX) file.";
            } else {
                errorMessage = "Only one row of data (column name header?) detected in the Excel (XLSX) file.";
            }
            throw new IOException(errorMessage);
        }

        // 2nd pass:
        tabFileDestination = File.createTempFile("data-", ".tab");
        int varQnty = dataTable.getVarQuantity().intValue();
        String[] caseRow = new String[varQnty];

        try (PrintWriter finalWriter = new PrintWriter(tabFileDestination.getAbsolutePath());
                BufferedReader secondPassReader = new BufferedReader(new FileReader(firstPassTempFile))) {
            String line;
            while ((line = secondPassReader.readLine()) != null) {
                // chop the line:
                line = line.replaceFirst("[\r\n]*$", "");
                // A negative limit makes split() keep trailing empty values.
                String[] valueTokens = line.split("" + delimiterChar, -2);

                if (valueTokens.length != varQnty) {
                    throw new IOException("Reading mismatch, line " + (lineCounter + 1) + " during the second pass: " + varQnty + " delimited values expected, " + valueTokens.length + " found.");
                }

                for (int i = 0; i < varQnty; i++) {
                    String token = valueTokens[i];
                    if (dataTable.getDataVariables().get(i).isTypeNumeric()) {
                        if (token.equals(".") || token.equals("") || token.equalsIgnoreCase("NA")) {
                            // Missing value - represented as an empty string in
                            // the final tab file
                            caseRow[i] = "";
                        } else if (token.equalsIgnoreCase("NaN")) {
                            // "Not a Number" special value:
                            caseRow[i] = "NaN";
                        } else if (token.equalsIgnoreCase("Inf") || token.equalsIgnoreCase("+Inf")) {
                            // Positive infinity:
                            caseRow[i] = "Inf";
                        } else if (token.equalsIgnoreCase("-Inf")) {
                            // Negative infinity:
                            caseRow[i] = "-Inf";
                        } else if (token.equalsIgnoreCase("null")) {
                            // By request from Gus - "NULL" is recognized as a
                            // numeric zero:
                            caseRow[i] = "0";
                        } else {
                            try {
                                caseRow[i] = Double.valueOf(token).toString();
                            } catch (NumberFormatException ex) {
                                throw new IOException("Failed to parse a value recognized as numeric in the first pass! column: " + i + ", value: " + token, ex);
                            }
                        }
                    } else {
                        if (!token.equals(".")) {
                            String charToken = token;
                            // Dealing with quotes:
                            // remove the leading and trailing quotes, if present:
                            charToken = charToken.replaceFirst("^\"", "");
                            charToken = charToken.replaceFirst("\"$", "");
                            // escape the remaining ones:
                            charToken = charToken.replace("\"", "\\\"");
                            // final pair of quotes:
                            charToken = "\"" + charToken + "\"";
                            caseRow[i] = charToken;
                        } else {
                            caseRow[i] = "";
                        }
                    }
                }

                finalWriter.println(StringUtils.join(caseRow, "\t"));
                lineCounter++;
            }

            // PrintWriter never throws on write failures; ask it explicitly so
            // a truncated tab file is not silently returned.
            if (finalWriter.checkError()) {
                throw new IOException("Failed to write the tab-delimited file during the second pass.");
            }
        }
    } finally {
        // The first-pass file is an internal intermediate; never leave it behind.
        if (!firstPassTempFile.delete()) {
            dbglog.fine("Could not delete temporary first pass file " + firstPassTempFile.getAbsolutePath());
        }
    }

    if (dataTable.getCaseQuantity().intValue() != lineCounter) {
        throw new IOException("Mismatch between line counts in first and final passes!");
    }

    // The real UNF is not computed here; only a placeholder is stored.
    dataTable.setUnf("UNF:6:NOTCALCULATED");

    ingesteddata.setTabDelimitedFile(tabFileDestination);
    ingesteddata.setDataTable(dataTable);

    dbglog.fine("Produced temporary file " + ingesteddata.getTabDelimitedFile().getAbsolutePath());
    dbglog.fine("Found " + dataTable.getVarQuantity() + " variables, " + dataTable.getCaseQuantity() + " observations.");

    String varNames = null;
    for (int i = 0; i < dataTable.getVarQuantity().intValue(); i++) {
        if (varNames == null) {
            varNames = dataTable.getDataVariables().get(i).getName();
        } else {
            varNames = varNames + ", " + dataTable.getDataVariables().get(i).getName();
        }
    }
    dbglog.fine("Variable names: " + varNames);

    return ingesteddata;
}
use of edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataIngest in project dataverse by IQSS.
the class XLSXFileReader method main.
/**
 * Command-line driver for manual testing: ingests the XLSX file named by the
 * first argument and prints the location of the generated tab-delimited file,
 * the variable and observation counts, and the variable names.
 *
 * @param args command-line arguments; <code>args[0]</code> is the path of the XLSX file.
 * @throws IllegalArgumentException if no file path is supplied.
 * @throws Exception if the file cannot be opened or ingested.
 */
public static void main(String[] args) throws Exception {
    if (args == null || args.length < 1) {
        // Fail with an explanation instead of a bare ArrayIndexOutOfBoundsException.
        throw new IllegalArgumentException("Usage: XLSXFileReader <path-to-xlsx-file>");
    }

    XLSXFileReader testReader = new XLSXFileReader(new XLSXFileReaderSpi());
    TabularDataIngest dataIngest;

    // Close the input stream as soon as the ingest is done, even on failure.
    try (BufferedInputStream xlsxInputStream = new BufferedInputStream(new FileInputStream(new File(args[0])))) {
        dataIngest = testReader.read(xlsxInputStream, null);
    }

    DataTable dataTable = dataIngest.getDataTable();

    System.out.println("Produced temporary file " + dataIngest.getTabDelimitedFile().getAbsolutePath());
    System.out.println("Found " + dataTable.getVarQuantity() + " variables, " + dataTable.getCaseQuantity() + " observations.");
    System.out.println("Variable names:");
    for (int i = 0; i < dataTable.getVarQuantity().intValue(); i++) {
        System.out.println(dataTable.getDataVariables().get(i).getName());
    }
}
use of edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataIngest in project dataverse by IQSS.
the class CSVFileReaderTest method testSubset.
/*
 * This test will read a CSV file, then attempt to subset
 * the resulting tab-delimited file and verify that the individual variable vectors
 * are legit.
 *
 * Every column of the generated file is read back through
 * TabularSubsetGenerator (as Double[], Long[] or String[], depending on the
 * variable type) and compared against the hard-coded expected vectors below.
 */
@Test
public void testSubset() {
    String testFile = "src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/csv/election_precincts.csv";
    Long expectedNumberOfVariables = 13L;
    // aka the number of lines in the TAB file produced by the ingest plugin
    Long expectedNumberOfCases = 24L;

    TabularDataIngest ingestResult = null;
    File generatedTabFile = null;
    DataTable generatedDataTable = null;

    try (BufferedInputStream stream = new BufferedInputStream(new FileInputStream(testFile))) {
        CSVFileReader instance = new CSVFileReader(new CSVFileReaderSpi());
        ingestResult = instance.read(stream, null);
        generatedTabFile = ingestResult.getTabDelimitedFile();
        generatedDataTable = ingestResult.getDataTable();
    } catch (IOException ex) {
        fail("" + ex);
    }

    assertNotNull(generatedDataTable);
    assertNotNull(generatedDataTable.getDataVariables());
    // JUnit convention: expected value first, actual value second.
    assertEquals(Long.valueOf(generatedDataTable.getDataVariables().size()), generatedDataTable.getVarQuantity());
    assertEquals(expectedNumberOfVariables, generatedDataTable.getVarQuantity());
    assertEquals(expectedNumberOfCases, generatedDataTable.getCaseQuantity());

    int numberOfCases = generatedDataTable.getCaseQuantity().intValue();

    // And now let's try and subset the individual vectors
    // First, the "continuous" vectors (we should be able to read these as Double[]):
    int[] floatColumns = { 2 };
    Double[][] floatVectors = {
        { 1.0, 3.0, 4.0, 6.0, 7.0, 8.0, 11.0, 12.0, 76.0, 77.0, 77.0, 77.0, 77.0, 77.0, 77.0, 77.0, 77.0, 77.0, 77.0, 77.0, 77.0, 77.0, 77.0, 77.0 }
    };

    int vectorCount = 0;
    for (int i : floatColumns) {
        if (!generatedDataTable.getDataVariables().get(i).isIntervalContinuous()) {
            fail("Column " + i + " was not properly processed as \"continuous\"");
        }
        // try-with-resources: the test owns the stream, so it closes it too.
        try (FileInputStream generatedTabInputStream = new FileInputStream(generatedTabFile)) {
            Double[] columnVector = TabularSubsetGenerator.subsetDoubleVector(generatedTabInputStream, i, numberOfCases);
            assertArrayEquals("column " + i + ":", floatVectors[vectorCount++], columnVector);
        } catch (IOException ioex) {
            fail("Failed to open generated tab-delimited file for reading" + ioex);
        }
    }

    // Discrete Numerics (aka, integers):
    int[] integerColumns = { 1, 4, 6, 7, 8, 9, 10, 11, 12 };
    Long[][] longVectors = {
        { 1L, 3L, 4L, 6L, 7L, 8L, 11L, 12L, 76L, 77L, 77L, 77L, 77L, 77L, 77L, 77L, 77L, 77L, 77L, 77L, 77L, 77L, 77L, 77L },
        { 1L, 2L, 3L, 4L, 5L, 11L, 13L, 15L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L },
        { 85729227L, 85699791L, 640323976L, 85695847L, 637089796L, 637089973L, 85695001L, 85695077L, 1111111L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L },
        { 205871733L, 205871735L, 205871283L, 258627915L, 257444575L, 205871930L, 260047422L, 262439738L, 1111111L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L },
        { 205871673L, 205871730L, 205871733L, 205872857L, 258627915L, 257444584L, 205873413L, 262439738L, 1111111L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L },
        { 25025000201L, 25025081001L, 25025000701L, 25025050901L, 25025040600L, 25025000502L, 25025040401L, 25025100900L, 1111111L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L },
        { 250250502002L, 250250502003L, 250250501013L, 250250408011L, 250250503001L, 250250103001L, 250250406002L, 250250406001L, 1111111L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L },
        { 250251011024001L, 250251011013003L, 250251304041007L, 250251011013006L, 250251010016000L, 250251011024002L, 250251001005004L, 250251002003002L, 1111111L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L },
        { 2109L, 2110L, 2111L, 2120L, 2121L, 2115L, 2116L, 2122L, 11111L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L }
    };

    vectorCount = 0;
    for (int i : integerColumns) {
        if (!generatedDataTable.getDataVariables().get(i).isIntervalDiscrete() || !generatedDataTable.getDataVariables().get(i).isTypeNumeric()) {
            fail("Column " + i + " was not properly processed as \"discrete numeric\"");
        }
        try (FileInputStream generatedTabInputStream = new FileInputStream(generatedTabFile)) {
            Long[] columnVector = TabularSubsetGenerator.subsetLongVector(generatedTabInputStream, i, numberOfCases);
            assertArrayEquals("column " + i + ":", longVectors[vectorCount++], columnVector);
        } catch (IOException ioex) {
            fail("Failed to open generated tab-delimited file for reading" + ioex);
        }
    }

    // And finally, Strings:
    int[] stringColumns = { 0, 3, 5 };
    String[][] stringVectors = {
        { "Dog", "Squirrel", "Antelope", "Zebra", "Lion", "Gazelle", "Cat", "Giraffe", "Cat", "Donkey", "Donkey", "Donkey", "Donkey", "Donkey", "Donkey", "Donkey", "Donkey", "Donkey", "Donkey", "Donkey", "Donkey", "Donkey", "Donkey", "Donkey" },
        { "East Boston", "Charlestown", "South Boston", "Bronx", "Roslindale", "Mission Hill", "Jamaica Plain", "Hyde Park", "Fenway/Kenmore", "Queens", "Queens", "Queens", "Queens", "Queens", "Queens", "Queens", "Queens", "Queens", "Queens", "Queens", "Queens", "Queens", "Queens", "Queens" },
        { "2-06", "1-09", "1-1A", "1-1B", "2-04", "3-05", "1-1C", "1-10A", "41-10A", "41-10A", "41-10A", "41-10A", "41-10A", "41-10A", "41-10A", "41-10A", "41-10A", "41-10A", "41-10A", "41-10A", "41-10A", "41-10A", "41-10A", "41-10A" }
    };

    vectorCount = 0;
    for (int i : stringColumns) {
        if (!generatedDataTable.getDataVariables().get(i).isTypeCharacter()) {
            fail("Column " + i + " was not properly processed as a character vector");
        }
        try (FileInputStream generatedTabInputStream = new FileInputStream(generatedTabFile)) {
            String[] columnVector = TabularSubsetGenerator.subsetStringVector(generatedTabInputStream, i, numberOfCases);
            assertArrayEquals("column " + i + ":", stringVectors[vectorCount++], columnVector);
        } catch (IOException ioex) {
            fail("Failed to open generated tab-delimited file for reading" + ioex);
        }
    }
}
use of edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataIngest in project dataverse by IQSS.
the class CSVFileReaderTest method testVariableUNFs.
/*
 * UNF test;
 * I'd like to use a file with more interesting values - "special" numbers, freaky dates, accents, etc.
 * for this. But checking it in with this simple file, for now.
 * (thinking about it, the "csv file from hell" may be a better test case for the UNF test)
 *
 * Each column of the generated tab file is read back as a vector of the
 * matching type, its UNF is calculated with UNFUtil and compared against the
 * hard-coded expected signature for that variable.
 */
@Test
public void testVariableUNFs() {
    String testFile = "src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/csv/election_precincts.csv";
    Long expectedNumberOfVariables = 13L;
    // aka the number of lines in the TAB file produced by the ingest plugin
    Long expectedNumberOfCases = 24L;
    String[] expectedUNFs = {
        "UNF:6:wb7OATtNC/leh1sOP5IGDQ==",
        "UNF:6:0V3xQ3ea56rzKwvGt9KBCA==",
        "UNF:6:0V3xQ3ea56rzKwvGt9KBCA==",
        "UNF:6:H9inAvq5eiIHW6lpqjjKhQ==",
        "UNF:6:Bh0M6QvunZwW1VoTyioRCQ==",
        "UNF:6:o5VTaEYz+0Kudf6hQEEupQ==",
        "UNF:6:eJRvbDJkIeDPrfN2dYpRfA==",
        "UNF:6:JD1wrtM12E7evrJJ3bRFGA==",
        "UNF:6:xUKbK9hb5o0nL5/mYiy7Bw==",
        "UNF:6:Mvq3BrdzoNhjndMiVr92Ww==",
        "UNF:6:KkHM6Qlyv3QlUd+BKqqB3Q==",
        "UNF:6:EWUVuyXKSpyllsrjHnheig==",
        "UNF:6:ri9JsRJxM2xpWSIq17oWNw=="
    };

    TabularDataIngest ingestResult = null;
    File generatedTabFile = null;
    DataTable generatedDataTable = null;

    try (BufferedInputStream stream = new BufferedInputStream(new FileInputStream(testFile))) {
        CSVFileReader instance = new CSVFileReader(new CSVFileReaderSpi());
        ingestResult = instance.read(stream, null);
        generatedTabFile = ingestResult.getTabDelimitedFile();
        generatedDataTable = ingestResult.getDataTable();
    } catch (IOException ex) {
        fail("" + ex);
    }

    assertNotNull(generatedDataTable);
    assertNotNull(generatedDataTable.getDataVariables());
    // JUnit convention: expected value first, actual value second.
    assertEquals(Long.valueOf(generatedDataTable.getDataVariables().size()), generatedDataTable.getVarQuantity());
    assertEquals(expectedNumberOfVariables, generatedDataTable.getVarQuantity());
    assertEquals(expectedNumberOfCases, generatedDataTable.getCaseQuantity());

    int numberOfCases = generatedDataTable.getCaseQuantity().intValue();

    for (int i = 0; i < expectedNumberOfVariables; i++) {
        String unf = null;

        if (generatedDataTable.getDataVariables().get(i).isIntervalContinuous()) {
            // try-with-resources: the test owns the stream, so it closes it too.
            try (FileInputStream generatedTabInputStream = new FileInputStream(generatedTabFile)) {
                Double[] columnVector = TabularSubsetGenerator.subsetDoubleVector(generatedTabInputStream, i, numberOfCases);
                try {
                    unf = UNFUtil.calculateUNF(columnVector);
                } catch (IOException | UnfException ioex) {
                    fail("Failed to generate the UNF for variable number " + i + ", (" + generatedDataTable.getDataVariables().get(i).getName() + ", floating point)");
                }
            } catch (IOException ioex) {
                fail("Failed to open generated tab-delimited file for reading" + ioex);
            }
        }

        if (generatedDataTable.getDataVariables().get(i).isIntervalDiscrete() && generatedDataTable.getDataVariables().get(i).isTypeNumeric()) {
            try (FileInputStream generatedTabInputStream = new FileInputStream(generatedTabFile)) {
                Long[] columnVector = TabularSubsetGenerator.subsetLongVector(generatedTabInputStream, i, numberOfCases);
                try {
                    unf = UNFUtil.calculateUNF(columnVector);
                } catch (IOException | UnfException ioex) {
                    fail("Failed to generate the UNF for variable number " + i + ", (" + generatedDataTable.getDataVariables().get(i).getName() + ", integer)");
                }
            } catch (IOException ioex) {
                fail("Failed to open generated tab-delimited file for reading" + ioex);
            }
        }

        if (generatedDataTable.getDataVariables().get(i).isTypeCharacter()) {
            try (FileInputStream generatedTabInputStream = new FileInputStream(generatedTabFile)) {
                String[] columnVector = TabularSubsetGenerator.subsetStringVector(generatedTabInputStream, i, numberOfCases);

                String[] dateFormats = null;
                // Special handling for Character strings that encode dates and times:
                if ("time".equals(generatedDataTable.getDataVariables().get(i).getFormatCategory()) || "date".equals(generatedDataTable.getDataVariables().get(i).getFormatCategory())) {
                    // One copy of the variable's format per case.
                    dateFormats = new String[expectedNumberOfCases.intValue()];
                    for (int j = 0; j < expectedNumberOfCases; j++) {
                        dateFormats[j] = generatedDataTable.getDataVariables().get(i).getFormat();
                    }
                }

                try {
                    if (dateFormats == null) {
                        unf = UNFUtil.calculateUNF(columnVector);
                    } else {
                        unf = UNFUtil.calculateUNF(columnVector, dateFormats);
                    }
                } catch (IOException | UnfException iex) {
                    fail("Failed to generate the UNF for variable number " + i + ", (" + generatedDataTable.getDataVariables().get(i).getName() + ", " + (dateFormats == null ? "String" : "Date/Time value") + ")");
                }
            } catch (IOException ioex) {
                fail("Failed to open generated tab-delimited file for reading" + ioex);
            }
        }

        assertEquals("Variable number " + i + ":", expectedUNFs[i], unf);
    }
}
Aggregations