use of edu.harvard.iq.dataverse.DataTable in project dataverse by IQSS.
the class CSVFileReader method read.
/**
 * Reads a CSV file and converts it into a Dataverse DataTable.
 * <p>
 * The tab-delimited output is written to a freshly created temporary file
 * ("data-*.tab"), which is handed back inside the returned object.
 *
 * @param stream a <code>BufferedInputStream</code> with the CSV content;
 *               must not be null.
 * @param dataFile not referenced by this method.
 * @return a <code>TabularDataIngest</code> object carrying the generated
 *         tab-delimited file and the populated DataTable.
 * @throws java.io.IOException if <code>stream</code> is null, if the
 *         temporary file cannot be created, or if a reading error occurs.
 */
@Override
public TabularDataIngest read(BufferedInputStream stream, File dataFile) throws IOException {
    init();
    if (stream == null) {
        throw new IOException(BundleUtil.getStringFromBundle("ingest.csv.nullStream"));
    }
    TabularDataIngest ingesteddata = new TabularDataIngest();
    DataTable dataTable = new DataTable();
    // NOTE(review): no charset is given, so the platform default is used to
    // decode the CSV bytes - confirm whether UTF-8 should be forced here.
    BufferedReader localBufferedReader = new BufferedReader(new InputStreamReader(stream));
    File tabFileDestination = File.createTempFile("data-", ".tab");
    boolean succeeded = false;
    // try-with-resources guarantees the writer is flushed and closed on every
    // path; PrintWriter.close() is idempotent, so this is harmless even if
    // readFile() closes the writer itself.
    try (PrintWriter tabFileWriter = new PrintWriter(tabFileDestination.getAbsolutePath())) {
        readFile(localBufferedReader, dataTable, tabFileWriter);
        succeeded = true;
    } finally {
        if (!succeeded) {
            // Best effort: nobody holds a reference to the partial output
            // once we fail, so do not leave it behind in the temp directory.
            tabFileDestination.delete();
        }
    }
    dbglog.fine("Tab file produced: " + tabFileDestination.getAbsolutePath());
    dataTable.setUnf("UNF:6:NOTCALCULATED");
    ingesteddata.setTabDelimitedFile(tabFileDestination);
    ingesteddata.setDataTable(dataTable);
    return ingesteddata;
}
use of edu.harvard.iq.dataverse.DataTable in project dataverse by IQSS.
the class XLSXFileReader method read.
/**
 * Reads an XLSX file and converts it into a Dataverse DataTable.
 * <p>
 * The work is done in two passes. The first pass delegates to
 * <code>processSheet(...)</code>; afterwards this method expects the
 * DataTable to carry the case/variable counts and the first-pass temporary
 * file to hold one delimited line per case. The second pass re-reads that
 * file, normalizes every value according to its column type and writes the
 * final tab-delimited file.
 *
 * @param stream a <code>BufferedInputStream</code> with the XLSX content.
 * @param dataFile not referenced by this method.
 * @return a <code>TabularDataIngest</code> object carrying the generated
 *         tab-delimited file and the populated DataTable.
 * @throws java.io.IOException if the spreadsheet cannot be parsed, contains
 *         no data rows, or the two passes disagree with each other.
 */
@Override
public TabularDataIngest read(BufferedInputStream stream, File dataFile) throws IOException {
    init();
    TabularDataIngest ingesteddata = new TabularDataIngest();
    DataTable dataTable = new DataTable();
    File firstPassTempFile = File.createTempFile("firstpass-", ".tab");
    File tabFileDestination = null;
    boolean succeeded = false;
    try {
        // PrintWriter.close() is idempotent, so closing here is safe even if
        // processSheet() closes the writer on its own.
        try (PrintWriter firstPassWriter = new PrintWriter(firstPassTempFile.getAbsolutePath())) {
            try {
                processSheet(stream, dataTable, firstPassWriter);
            } catch (Exception ex) {
                // Keep the original exception as the cause for diagnostics.
                throw new IOException("Could not parse Excel/XLSX spreadsheet. " + ex.getMessage(), ex);
            }
        }
        if (dataTable.getCaseQuantity() == null || dataTable.getCaseQuantity().intValue() < 1) {
            String errorMessage;
            if (dataTable.getVarQuantity() == null || dataTable.getVarQuantity().intValue() < 1) {
                errorMessage = "No rows of data found in the Excel (XLSX) file.";
            } else {
                errorMessage = "Only one row of data (column name header?) detected in the Excel (XLSX) file.";
            }
            throw new IOException(errorMessage);
        }
        // 2nd pass:
        tabFileDestination = File.createTempFile("data-", ".tab");
        int varQnty = dataTable.getVarQuantity().intValue();
        int lineCounter = 0;
        String delimiter = "" + delimiterChar;
        String[] caseRow = new String[varQnty];
        // Resources are closed in reverse order: reader first, then writer.
        try (PrintWriter finalWriter = new PrintWriter(tabFileDestination.getAbsolutePath());
                BufferedReader secondPassReader = new BufferedReader(new FileReader(firstPassTempFile))) {
            String line;
            // readLine() already strips the line terminator, so no extra
            // chopping of trailing CR/LF characters is needed.
            while ((line = secondPassReader.readLine()) != null) {
                // A negative limit keeps trailing empty tokens, so that
                // missing values at the end of a row are still counted.
                String[] valueTokens = line.split(delimiter, -1);
                if (valueTokens.length != varQnty) {
                    throw new IOException("Reading mismatch, line " + (lineCounter + 1) + " during the second pass: " + varQnty + " delimited values expected, " + valueTokens.length + " found.");
                }
                for (int i = 0; i < varQnty; i++) {
                    if (dataTable.getDataVariables().get(i).isTypeNumeric()) {
                        caseRow[i] = formatNumericToken(valueTokens[i], i);
                    } else {
                        caseRow[i] = formatCharacterToken(valueTokens[i]);
                    }
                }
                finalWriter.println(StringUtils.join(caseRow, "\t"));
                lineCounter++;
            }
        }
        if (dataTable.getCaseQuantity().intValue() != lineCounter) {
            throw new IOException("Mismatch between line counts in first and final passes!");
        }
        succeeded = true;
    } finally {
        // Best effort cleanup: the first-pass file is purely intermediate,
        // and a partially written final file is useless once we have failed.
        firstPassTempFile.delete();
        if (!succeeded && tabFileDestination != null) {
            tabFileDestination.delete();
        }
    }
    dataTable.setUnf("UNF:6:NOTCALCULATED");
    ingesteddata.setTabDelimitedFile(tabFileDestination);
    ingesteddata.setDataTable(dataTable);
    dbglog.fine("Produced temporary file " + ingesteddata.getTabDelimitedFile().getAbsolutePath());
    dbglog.fine("Found " + dataTable.getVarQuantity() + " variables, " + dataTable.getCaseQuantity() + " observations.");
    StringBuilder varNames = new StringBuilder();
    for (int i = 0; i < dataTable.getVarQuantity().intValue(); i++) {
        if (i > 0) {
            varNames.append(", ");
        }
        varNames.append(dataTable.getDataVariables().get(i).getName());
    }
    dbglog.fine("Variable names: " + varNames);
    return ingesteddata;
}

/**
 * Converts a first-pass token from a numeric column into the form used in
 * the final tab-delimited file.
 *
 * @param token the raw value read from the first-pass file.
 * @param column zero-based column index, used in the error message only.
 * @return the normalized value; an empty string for a missing value.
 * @throws IOException if the token cannot be parsed as a number.
 */
private static String formatNumericToken(String token, int column) throws IOException {
    if (token == null || token.equals(".") || token.equals("") || token.equalsIgnoreCase("NA")) {
        // Missing value - represented as an empty string in
        // the final tab file
        return "";
    }
    if (token.equalsIgnoreCase("NaN")) {
        // "Not a Number" special value:
        return "NaN";
    }
    if (token.equalsIgnoreCase("Inf") || token.equalsIgnoreCase("+Inf")) {
        // Positive infinity:
        return "Inf";
    }
    if (token.equalsIgnoreCase("-Inf")) {
        // Negative infinity:
        return "-Inf";
    }
    if (token.equalsIgnoreCase("null")) {
        // By request from Gus - "NULL" is recognized as a
        // numeric zero:
        return "0";
    }
    try {
        // Same parsing and formatting as the deprecated new Double(String).
        return Double.toString(Double.parseDouble(token));
    } catch (NumberFormatException ex) {
        throw new IOException("Failed to parse a value recognized as numeric in the first pass! column: " + column + ", value: " + token, ex);
    }
}

/**
 * Converts a first-pass token from a character column into a double-quoted
 * string with embedded quotes escaped.
 *
 * @param token the raw value read from the first-pass file.
 * @return the quoted value, or an empty string when the token is null or
 *         the single-dot missing value marker.
 */
private static String formatCharacterToken(String token) {
    if (token == null || token.equals(".")) {
        return "";
    }
    // Dealing with quotes:
    // remove the leading and trailing quotes, if present:
    String charToken = token.replaceFirst("^\"", "");
    charToken = charToken.replaceFirst("\"$", "");
    // escape the remaining ones:
    charToken = charToken.replace("\"", "\\\"");
    // final pair of quotes:
    return "\"" + charToken + "\"";
}
use of edu.harvard.iq.dataverse.DataTable in project dataverse by IQSS.
the class XLSXFileReader method main.
/**
 * Command-line harness: ingests the XLSX file named by the first argument
 * and prints the location of the generated tab-delimited file together with
 * the variable/observation counts and the variable names.
 *
 * @param args <code>args[0]</code> is the path of the XLSX file to read.
 * @throws IllegalArgumentException if no file path was supplied.
 * @throws Exception if the file cannot be opened or ingested.
 */
public static void main(String[] args) throws Exception {
    if (args == null || args.length < 1) {
        throw new IllegalArgumentException("Usage: XLSXFileReader <path-to-xlsx-file>");
    }
    XLSXFileReader testReader = new XLSXFileReader(new XLSXFileReaderSpi());
    TabularDataIngest dataIngest;
    // Close the input stream whether or not the ingest succeeds.
    try (BufferedInputStream xlsxInputStream = new BufferedInputStream(new FileInputStream(new File(args[0])))) {
        dataIngest = testReader.read(xlsxInputStream, null);
    }
    DataTable dataTable = dataIngest.getDataTable();
    System.out.println("Produced temporary file " + dataIngest.getTabDelimitedFile().getAbsolutePath());
    System.out.println("Found " + dataTable.getVarQuantity() + " variables, " + dataTable.getCaseQuantity() + " observations.");
    System.out.println("Variable names:");
    for (int i = 0; i < dataTable.getVarQuantity().intValue(); i++) {
        System.out.println(dataTable.getDataVariables().get(i).getName());
    }
}
use of edu.harvard.iq.dataverse.DataTable in project dataverse by IQSS.
the class IngestUtilTest method testCheckForDuplicateFileNamesTabular.
/**
 * Test tabular files (e.g., .dta) are changed when .tab files with the same
 * name exist.
 * <p>
 * The dataset version already holds a file labelled "foobar.tab"; a second
 * file labelled "foobar.dta" is then passed to
 * IngestUtil.checkForDuplicateFileNamesFinal and is expected to come back
 * labelled "foobar-1.dta".
 */
@Test
public void testCheckForDuplicateFileNamesTabular() throws Exception {
    SimpleDateFormat dateFmt = new SimpleDateFormat("yyyyMMdd");
    // create dataset
    Dataset dataset = makeDataset();
    // create dataset version
    DatasetVersion datasetVersion = dataset.getEditVersion();
    datasetVersion.setCreateTime(dateFmt.parse("20001012"));
    // NOTE(review): this passes the current value straight back to the
    // setter; possibly a different timestamp was intended - confirm.
    datasetVersion.setLastUpdateTime(datasetVersion.getLastUpdateTime());
    datasetVersion.setId(MocksFactory.nextId());
    datasetVersion.setReleaseTime(dateFmt.parse("20010101"));
    datasetVersion.setVersionState(DatasetVersion.VersionState.RELEASED);
    datasetVersion.setMinorVersionNumber(0L);
    datasetVersion.setVersionNumber(1L);
    datasetVersion.setFileMetadatas(new ArrayList<>());
    // create datafiles
    List<DataFile> dataFileList = new ArrayList<>();
    // first file: already part of the dataset version, labelled as .tab
    DataFile datafile1 = newIngestedStataDataFile(dataset);
    // set metadata and add version
    FileMetadata fmd1 = new FileMetadata();
    fmd1.setId(1L);
    fmd1.setLabel("foobar.tab");
    fmd1.setDataFile(datafile1);
    datafile1.getFileMetadatas().add(fmd1);
    datasetVersion.getFileMetadatas().add(fmd1);
    fmd1.setDatasetVersion(datasetVersion);
    // second file: the new one being checked, still labelled as .dta
    DataFile datafile2 = newIngestedStataDataFile(dataset);
    // set metadata; this file is only put on the list handed to the check
    FileMetadata fmd2 = new FileMetadata();
    fmd2.setId(2L);
    fmd2.setLabel("foobar.dta");
    fmd2.setDataFile(datafile2);
    datafile2.getFileMetadatas().add(fmd2);
    dataFileList.add(datafile2);
    IngestUtil.checkForDuplicateFileNamesFinal(datasetVersion, dataFileList);
    boolean file2NameAltered = false;
    for (DataFile df : dataFileList) {
        if (df.getFileMetadata().getLabel().equals("foobar-1.dta")) {
            file2NameAltered = true;
        }
    }
    // check filename is altered since tabular and will change to .tab after ingest
    // (expected value goes first so that a failure report reads correctly)
    assertEquals(true, file2NameAltered);
}

/**
 * Builds the DataFile fixture shared by both files of the tabular duplicate
 * name test: storage identifier "foobar.dta", ingest marked as done and a
 * DataTable whose original format is Stata.
 *
 * @param owner the dataset that owns the new file.
 * @return the configured DataFile.
 */
private static DataFile newIngestedStataDataFile(Dataset owner) {
    // NOTE(review): "x-strata" looks like a typo for "x-stata" (compare the
    // original file format below); left untouched because changing it may
    // alter what the test exercises - confirm.
    DataFile dataFile = new DataFile("application/x-strata");
    dataFile.setStorageIdentifier("foobar.dta");
    dataFile.setFilesize(200);
    dataFile.setModificationTime(new Timestamp(new Date().getTime()));
    dataFile.setCreateDate(new Timestamp(new Date().getTime()));
    dataFile.setPermissionModificationTime(new Timestamp(new Date().getTime()));
    dataFile.setOwner(owner);
    dataFile.setIngestDone();
    dataFile.setChecksumType(DataFile.ChecksumType.SHA1);
    dataFile.setChecksumValue("Unknown");
    DataTable dataTable = new DataTable();
    dataTable.setOriginalFileFormat("application/x-stata");
    dataFile.setDataTable(dataTable);
    return dataFile;
}
use of edu.harvard.iq.dataverse.DataTable in project dataverse by IQSS.
the class CSVFileReaderTest method testSubset.
/*
 * This test will read a CSV file, then attempt to subset
 * the resulting tab-delimited file and verify that the individual variable vectors
 * are legit.
 *
 * Each column is read back from a freshly opened stream on the generated
 * tab file; the stream is closed again once the column has been checked.
 */
@Test
public void testSubset() {
    String testFile = "src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/csv/election_precincts.csv";
    Long expectedNumberOfVariables = 13L;
    // aka the number of lines in the TAB file produced by the ingest plugin
    Long expectedNumberOfCases = 24L;
    File generatedTabFile = null;
    DataTable generatedDataTable = null;
    try (BufferedInputStream stream = new BufferedInputStream(new FileInputStream(testFile))) {
        CSVFileReader instance = new CSVFileReader(new CSVFileReaderSpi());
        TabularDataIngest ingestResult = instance.read(stream, null);
        generatedTabFile = ingestResult.getTabDelimitedFile();
        generatedDataTable = ingestResult.getDataTable();
    } catch (IOException ex) {
        fail("" + ex);
    }
    assertNotNull(generatedDataTable);
    assertNotNull(generatedDataTable.getDataVariables());
    // Expected value first, so that failure reports read correctly.
    assertEquals(Long.valueOf(generatedDataTable.getDataVariables().size()), generatedDataTable.getVarQuantity());
    assertEquals(expectedNumberOfVariables, generatedDataTable.getVarQuantity());
    assertEquals(expectedNumberOfCases, generatedDataTable.getCaseQuantity());
    int numberOfCases = generatedDataTable.getCaseQuantity().intValue();
    // And now let's try and subset the individual vectors
    // First, the "continuous" vectors (we should be able to read these as Double[]):
    int[] floatColumns = { 2 };
    Double[][] floatVectors = {
        { 1.0, 3.0, 4.0, 6.0, 7.0, 8.0, 11.0, 12.0, 76.0,
          77.0, 77.0, 77.0, 77.0, 77.0,
          77.0, 77.0, 77.0, 77.0, 77.0,
          77.0, 77.0, 77.0, 77.0, 77.0 }
    };
    int vectorCount = 0;
    for (int i : floatColumns) {
        if (!generatedDataTable.getDataVariables().get(i).isIntervalContinuous()) {
            fail("Column " + i + " was not properly processed as \"continuous\"");
        }
        try (FileInputStream generatedTabInputStream = new FileInputStream(generatedTabFile)) {
            Double[] columnVector = TabularSubsetGenerator.subsetDoubleVector(generatedTabInputStream, i, numberOfCases);
            assertArrayEquals("column " + i + ":", floatVectors[vectorCount++], columnVector);
        } catch (FileNotFoundException ioex) {
            fail("Failed to open generated tab-delimited file for reading" + ioex);
        } catch (IOException ioex) {
            fail("Failed to close generated tab-delimited file" + ioex);
        }
    }
    // Discrete Numerics (aka, integers):
    int[] integerColumns = { 1, 4, 6, 7, 8, 9, 10, 11, 12 };
    Long[][] longVectors = {
        { 1L, 3L, 4L, 6L, 7L, 8L, 11L, 12L, 76L,
          77L, 77L, 77L, 77L, 77L,
          77L, 77L, 77L, 77L, 77L,
          77L, 77L, 77L, 77L, 77L },
        { 1L, 2L, 3L, 4L, 5L, 11L, 13L, 15L, 19L,
          19L, 19L, 19L, 19L, 19L,
          19L, 19L, 19L, 19L, 19L,
          19L, 19L, 19L, 19L, 19L },
        { 85729227L, 85699791L, 640323976L, 85695847L, 637089796L, 637089973L, 85695001L, 85695077L, 1111111L,
          4444444L, 4444444L, 4444444L, 4444444L, 4444444L,
          4444444L, 4444444L, 4444444L, 4444444L, 4444444L,
          4444444L, 4444444L, 4444444L, 4444444L, 4444444L },
        { 205871733L, 205871735L, 205871283L, 258627915L, 257444575L, 205871930L, 260047422L, 262439738L, 1111111L,
          4444444L, 4444444L, 4444444L, 4444444L, 4444444L,
          4444444L, 4444444L, 4444444L, 4444444L, 4444444L,
          4444444L, 4444444L, 4444444L, 4444444L, 4444444L },
        { 205871673L, 205871730L, 205871733L, 205872857L, 258627915L, 257444584L, 205873413L, 262439738L, 1111111L,
          4444444L, 4444444L, 4444444L, 4444444L, 4444444L,
          4444444L, 4444444L, 4444444L, 4444444L, 4444444L,
          4444444L, 4444444L, 4444444L, 4444444L, 4444444L },
        { 25025000201L, 25025081001L, 25025000701L, 25025050901L, 25025040600L, 25025000502L, 25025040401L, 25025100900L, 1111111L,
          4444444L, 4444444L, 4444444L, 4444444L, 4444444L,
          4444444L, 4444444L, 4444444L, 4444444L, 4444444L,
          4444444L, 4444444L, 4444444L, 4444444L, 4444444L },
        { 250250502002L, 250250502003L, 250250501013L, 250250408011L, 250250503001L, 250250103001L, 250250406002L, 250250406001L, 1111111L,
          4444444L, 4444444L, 4444444L, 4444444L, 4444444L,
          4444444L, 4444444L, 4444444L, 4444444L, 4444444L,
          4444444L, 4444444L, 4444444L, 4444444L, 4444444L },
        { 250251011024001L, 250251011013003L, 250251304041007L, 250251011013006L, 250251010016000L, 250251011024002L, 250251001005004L, 250251002003002L, 1111111L,
          4444444L, 4444444L, 4444444L, 4444444L, 4444444L,
          4444444L, 4444444L, 4444444L, 4444444L, 4444444L,
          4444444L, 4444444L, 4444444L, 4444444L, 4444444L },
        { 2109L, 2110L, 2111L, 2120L, 2121L, 2115L, 2116L, 2122L, 11111L,
          4444444L, 4444444L, 4444444L, 4444444L, 4444444L,
          4444444L, 4444444L, 4444444L, 4444444L, 4444444L,
          4444444L, 4444444L, 4444444L, 4444444L, 4444444L }
    };
    vectorCount = 0;
    for (int i : integerColumns) {
        if (!generatedDataTable.getDataVariables().get(i).isIntervalDiscrete() || !generatedDataTable.getDataVariables().get(i).isTypeNumeric()) {
            fail("Column " + i + " was not properly processed as \"discrete numeric\"");
        }
        try (FileInputStream generatedTabInputStream = new FileInputStream(generatedTabFile)) {
            Long[] columnVector = TabularSubsetGenerator.subsetLongVector(generatedTabInputStream, i, numberOfCases);
            assertArrayEquals("column " + i + ":", longVectors[vectorCount++], columnVector);
        } catch (FileNotFoundException ioex) {
            fail("Failed to open generated tab-delimited file for reading" + ioex);
        } catch (IOException ioex) {
            fail("Failed to close generated tab-delimited file" + ioex);
        }
    }
    // And finally, Strings:
    int[] stringColumns = { 0, 3, 5 };
    String[][] stringVectors = {
        { "Dog", "Squirrel", "Antelope", "Zebra", "Lion", "Gazelle", "Cat", "Giraffe", "Cat",
          "Donkey", "Donkey", "Donkey", "Donkey", "Donkey",
          "Donkey", "Donkey", "Donkey", "Donkey", "Donkey",
          "Donkey", "Donkey", "Donkey", "Donkey", "Donkey" },
        { "East Boston", "Charlestown", "South Boston", "Bronx", "Roslindale", "Mission Hill", "Jamaica Plain", "Hyde Park", "Fenway/Kenmore",
          "Queens", "Queens", "Queens", "Queens", "Queens",
          "Queens", "Queens", "Queens", "Queens", "Queens",
          "Queens", "Queens", "Queens", "Queens", "Queens" },
        { "2-06", "1-09", "1-1A", "1-1B", "2-04", "3-05", "1-1C", "1-10A",
          "41-10A", "41-10A", "41-10A", "41-10A",
          "41-10A", "41-10A", "41-10A", "41-10A",
          "41-10A", "41-10A", "41-10A", "41-10A",
          "41-10A", "41-10A", "41-10A", "41-10A" }
    };
    vectorCount = 0;
    for (int i : stringColumns) {
        if (!generatedDataTable.getDataVariables().get(i).isTypeCharacter()) {
            fail("Column " + i + " was not properly processed as a character vector");
        }
        try (FileInputStream generatedTabInputStream = new FileInputStream(generatedTabFile)) {
            String[] columnVector = TabularSubsetGenerator.subsetStringVector(generatedTabInputStream, i, numberOfCases);
            assertArrayEquals("column " + i + ":", stringVectors[vectorCount++], columnVector);
        } catch (FileNotFoundException ioex) {
            fail("Failed to open generated tab-delimited file for reading" + ioex);
        } catch (IOException ioex) {
            fail("Failed to close generated tab-delimited file" + ioex);
        }
    }
}
Aggregations