Search in sources :

Example 1 with DataTable

use of edu.harvard.iq.dataverse.DataTable in project dataverse by IQSS.

the class CSVFileReader method read.

/**
 * Reads a CSV file, converts it into a dataverse DataTable.
 *
 * The tab-delimited output is written to a freshly created temporary file
 * ("data-*.tab"), which is attached to the returned ingest object.
 *
 * @param stream a <code>BufferedInputStream</code>; must not be null.
 * @param dataFile not referenced by this implementation.
 * @return a <code>TabularDataIngest</code> object carrying the tab file and
 *         the populated <code>DataTable</code>.
 * @throws java.io.IOException if <code>stream</code> is null or a reading
 *         error occurs.
 */
@Override
public TabularDataIngest read(BufferedInputStream stream, File dataFile) throws IOException {
    init();
    if (stream == null) {
        throw new IOException(BundleUtil.getStringFromBundle("ingest.csv.nullStream"));
    }
    TabularDataIngest ingesteddata = new TabularDataIngest();
    DataTable dataTable = new DataTable();
    // The reader is deliberately left open: closing it would also close the
    // caller-supplied stream.
    // NOTE(review): no charset is specified, so the JVM default charset is
    // used to decode the CSV - confirm this is intended.
    BufferedReader localBufferedReader = new BufferedReader(new InputStreamReader(stream));
    File tabFileDestination = File.createTempFile("data-", ".tab");
    // try-with-resources guarantees the tab file writer is flushed and closed
    // even when readFile() throws; closing a PrintWriter twice is harmless if
    // readFile() already closed it.
    try (PrintWriter tabFileWriter = new PrintWriter(tabFileDestination.getAbsolutePath())) {
        readFile(localBufferedReader, dataTable, tabFileWriter);
    }
    dbglog.fine("Tab file produced: " + tabFileDestination.getAbsolutePath());
    // The UNF signature is not computed here; a placeholder is stored instead.
    dataTable.setUnf("UNF:6:NOTCALCULATED");
    ingesteddata.setTabDelimitedFile(tabFileDestination);
    ingesteddata.setDataTable(dataTable);
    return ingesteddata;
}
Also used : DataTable(edu.harvard.iq.dataverse.DataTable) InputStreamReader(java.io.InputStreamReader) BufferedReader(java.io.BufferedReader) IOException(java.io.IOException) TabularDataIngest(edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataIngest) File(java.io.File) PrintWriter(java.io.PrintWriter)

Example 2 with DataTable

use of edu.harvard.iq.dataverse.DataTable in project dataverse by IQSS.

the class XLSXFileReader method read.

/**
 * Reads an XLSX file, converts it into a dataverse DataTable.
 *
 * The work is done in two passes: the spreadsheet is first dumped into a
 * temporary delimited file while the DataTable metadata is populated, then
 * that file is re-read and every value is normalized (numeric special values,
 * quoting of character values) into the final tab-delimited file.
 *
 * @param stream a <code>BufferedInputStream</code>.
 * @param dataFile not referenced by this implementation.
 * @return a <code>TabularDataIngest</code> object
 * @throws java.io.IOException if a reading error occurs, if the spreadsheet
 *         cannot be parsed, if it holds no data rows, or if the two passes
 *         disagree on the shape of the data.
 */
@Override
public TabularDataIngest read(BufferedInputStream stream, File dataFile) throws IOException {
    init();
    TabularDataIngest ingesteddata = new TabularDataIngest();
    DataTable dataTable = new DataTable();
    // 1st pass:
    File firstPassTempFile = File.createTempFile("firstpass-", ".tab");
    // The writer is closed (and therefore flushed) before the temp file is
    // read back in the second pass, and also when processSheet() fails.
    try (PrintWriter firstPassWriter = new PrintWriter(firstPassTempFile.getAbsolutePath())) {
        try {
            processSheet(stream, dataTable, firstPassWriter);
        } catch (Exception ex) {
            // Keep the original cause so the parser's stack trace is not lost.
            throw new IOException("Could not parse Excel/XLSX spreadsheet. " + ex.getMessage(), ex);
        }
    }
    if (dataTable.getCaseQuantity() == null || dataTable.getCaseQuantity().intValue() < 1) {
        String errorMessage;
        if (dataTable.getVarQuantity() == null || dataTable.getVarQuantity().intValue() < 1) {
            errorMessage = "No rows of data found in the Excel (XLSX) file.";
        } else {
            errorMessage = "Only one row of data (column name header?) detected in the Excel (XLSX) file.";
        }
        throw new IOException(errorMessage);
    }
    // 2nd pass:
    File tabFileDestination = File.createTempFile("data-", ".tab");
    int varQnty = dataTable.getVarQuantity().intValue();
    int lineCounter = 0;
    String[] caseRow = new String[varQnty];
    // Resources are closed in reverse declaration order (reader, then writer),
    // on both the normal and the exceptional path.
    try (PrintWriter finalWriter = new PrintWriter(tabFileDestination.getAbsolutePath());
            BufferedReader secondPassReader = new BufferedReader(new FileReader(firstPassTempFile))) {
        String line;
        while ((line = secondPassReader.readLine()) != null) {
            // chop the line:
            line = line.replaceFirst("[\r\n]*$", "");
            // A negative limit keeps trailing empty tokens, so empty trailing
            // cells still count towards the column total. split() never
            // returns null, so no null check is needed.
            String[] valueTokens = line.split("" + delimiterChar, -2);
            if (valueTokens.length != varQnty) {
                throw new IOException("Reading mismatch, line " + (lineCounter + 1) + " during the second pass: " + varQnty + " delimited values expected, " + valueTokens.length + " found.");
            }
            for (int i = 0; i < varQnty; i++) {
                if (dataTable.getDataVariables().get(i).isTypeNumeric()) {
                    if (valueTokens[i] == null || valueTokens[i].equals(".") || valueTokens[i].equals("") || valueTokens[i].equalsIgnoreCase("NA")) {
                        // Missing value - represented as an empty string in
                        // the final tab file
                        caseRow[i] = "";
                    } else if (valueTokens[i].equalsIgnoreCase("NaN")) {
                        // "Not a Number" special value:
                        caseRow[i] = "NaN";
                    } else if (valueTokens[i].equalsIgnoreCase("Inf") || valueTokens[i].equalsIgnoreCase("+Inf")) {
                        // Positive infinity:
                        caseRow[i] = "Inf";
                    } else if (valueTokens[i].equalsIgnoreCase("-Inf")) {
                        // Negative infinity:
                        caseRow[i] = "-Inf";
                    } else if (valueTokens[i].equalsIgnoreCase("null")) {
                        // By request from Gus - "NULL" is recognized as a
                        // numeric zero:
                        caseRow[i] = "0";
                    } else {
                        try {
                            // Round-trip through Double to normalize the
                            // textual representation of the number.
                            caseRow[i] = Double.valueOf(valueTokens[i]).toString();
                        } catch (NumberFormatException ex) {
                            throw new IOException("Failed to parse a value recognized as numeric in the first pass! column: " + i + ", value: " + valueTokens[i], ex);
                        }
                    }
                } else {
                    if (valueTokens[i] != null && !valueTokens[i].equals(".")) {
                        String charToken = valueTokens[i];
                        // Dealing with quotes:
                        // remove the leading and trailing quotes, if present:
                        charToken = charToken.replaceFirst("^\"", "");
                        charToken = charToken.replaceFirst("\"$", "");
                        // escape the remaining ones:
                        charToken = charToken.replace("\"", "\\\"");
                        // final pair of quotes:
                        charToken = "\"" + charToken + "\"";
                        caseRow[i] = charToken;
                    } else {
                        // A lone "." is treated as a missing character value.
                        caseRow[i] = "";
                    }
                }
            }
            finalWriter.println(StringUtils.join(caseRow, "\t"));
            lineCounter++;
        }
    }
    if (dataTable.getCaseQuantity().intValue() != lineCounter) {
        throw new IOException("Mismatch between line counts in first and final passes!");
    }
    // The UNF signature is not computed here; a placeholder is stored instead.
    dataTable.setUnf("UNF:6:NOTCALCULATED");
    ingesteddata.setTabDelimitedFile(tabFileDestination);
    ingesteddata.setDataTable(dataTable);
    dbglog.fine("Produced temporary file " + ingesteddata.getTabDelimitedFile().getAbsolutePath());
    dbglog.fine("Found " + dataTable.getVarQuantity() + " variables, " + dataTable.getCaseQuantity() + " observations.");
    // Debug-only summary of the variable names, comma separated.
    StringBuilder varNames = new StringBuilder();
    for (int i = 0; i < dataTable.getVarQuantity().intValue(); i++) {
        if (i > 0) {
            varNames.append(", ");
        }
        varNames.append(dataTable.getDataVariables().get(i).getName());
    }
    dbglog.fine("Variable names: " + varNames);
    return ingesteddata;
}
Also used : DataTable(edu.harvard.iq.dataverse.DataTable) TabularDataFileReader(edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataFileReader) FileReader(java.io.FileReader) TabularDataIngest(edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataIngest) XSSFRichTextString(org.apache.poi.xssf.usermodel.XSSFRichTextString) NamingException(javax.naming.NamingException) SAXException(org.xml.sax.SAXException)

Example 3 with DataTable

use of edu.harvard.iq.dataverse.DataTable in project dataverse by IQSS.

the class XLSXFileReader method main.

/**
 * Command-line driver: ingests the XLSX file named by the first argument and
 * prints the location of the produced tab file, the variable and observation
 * counts, and the variable names.
 *
 * @param args <code>args[0]</code> is the path of the XLSX file to read.
 * @throws Exception if the file cannot be opened or ingested.
 */
public static void main(String[] args) throws Exception {
    XLSXFileReader testReader = new XLSXFileReader(new XLSXFileReaderSpi());
    TabularDataIngest dataIngest;
    // Close the input file as soon as the ingest is done, also on failure.
    try (BufferedInputStream xlsxInputStream = new BufferedInputStream(new FileInputStream(new File(args[0])))) {
        dataIngest = testReader.read(xlsxInputStream, null);
    }
    DataTable dataTable = dataIngest.getDataTable();
    System.out.println("Produced temporary file " + dataIngest.getTabDelimitedFile().getAbsolutePath());
    System.out.println("Found " + dataTable.getVarQuantity() + " variables, " + dataTable.getCaseQuantity() + " observations.");
    System.out.println("Variable names:");
    for (int i = 0; i < dataTable.getVarQuantity().intValue(); i++) {
        System.out.println(dataTable.getDataVariables().get(i).getName());
    }
}
Also used : DataTable(edu.harvard.iq.dataverse.DataTable) TabularDataIngest(edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataIngest)

Example 4 with DataTable

use of edu.harvard.iq.dataverse.DataTable in project dataverse by IQSS.

the class IngestUtilTest method testCheckForDuplicateFileNamesTabular.

/**
 * Test tabular files (e.g., .dta) are changed when .tab files with the same
 * name exist.
 *
 * The dataset version already holds a file labelled "foobar.tab"; a new
 * file labelled "foobar.dta" is then checked against it and is expected to
 * be relabelled "foobar-1.dta".
 */
@Test
public void testCheckForDuplicateFileNamesTabular() throws Exception {
    SimpleDateFormat dateFmt = new SimpleDateFormat("yyyyMMdd");
    // create dataset
    Dataset dataset = makeDataset();
    // create dataset version
    DatasetVersion datasetVersion = dataset.getEditVersion();
    datasetVersion.setCreateTime(dateFmt.parse("20001012"));
    // NOTE(review): this writes the current value back to itself, so it has
    // no visible effect - confirm whether another timestamp was intended.
    datasetVersion.setLastUpdateTime(datasetVersion.getLastUpdateTime());
    datasetVersion.setId(MocksFactory.nextId());
    datasetVersion.setReleaseTime(dateFmt.parse("20010101"));
    datasetVersion.setVersionState(DatasetVersion.VersionState.RELEASED);
    datasetVersion.setMinorVersionNumber(0L);
    datasetVersion.setVersionNumber(1L);
    datasetVersion.setFileMetadatas(new ArrayList<>());
    // create datafiles
    List<DataFile> dataFileList = new ArrayList<>();
    // NOTE(review): the content type reads "x-strata" while the DataTable's
    // original format below is "x-stata" - confirm whether this is intended.
    DataFile datafile1 = new DataFile("application/x-strata");
    datafile1.setStorageIdentifier("foobar.dta");
    datafile1.setFilesize(200);
    datafile1.setModificationTime(new Timestamp(new Date().getTime()));
    datafile1.setCreateDate(new Timestamp(new Date().getTime()));
    datafile1.setPermissionModificationTime(new Timestamp(new Date().getTime()));
    datafile1.setOwner(dataset);
    datafile1.setIngestDone();
    datafile1.setChecksumType(DataFile.ChecksumType.SHA1);
    datafile1.setChecksumValue("Unknown");
    DataTable dt1 = new DataTable();
    dt1.setOriginalFileFormat("application/x-stata");
    datafile1.setDataTable(dt1);
    // set metadata and add version
    FileMetadata fmd1 = new FileMetadata();
    fmd1.setId(1L);
    fmd1.setLabel("foobar.tab");
    fmd1.setDataFile(datafile1);
    datafile1.getFileMetadatas().add(fmd1);
    datasetVersion.getFileMetadatas().add(fmd1);
    fmd1.setDatasetVersion(datasetVersion);
    // second file: same storage name, labelled as .dta; only this one is
    // handed to the duplicate check as a "new" file
    DataFile datafile2 = new DataFile("application/x-strata");
    datafile2.setStorageIdentifier("foobar.dta");
    datafile2.setFilesize(200);
    datafile2.setModificationTime(new Timestamp(new Date().getTime()));
    datafile2.setCreateDate(new Timestamp(new Date().getTime()));
    datafile2.setPermissionModificationTime(new Timestamp(new Date().getTime()));
    datafile2.setOwner(dataset);
    datafile2.setIngestDone();
    datafile2.setChecksumType(DataFile.ChecksumType.SHA1);
    datafile2.setChecksumValue("Unknown");
    DataTable dt2 = new DataTable();
    dt2.setOriginalFileFormat("application/x-stata");
    datafile2.setDataTable(dt2);
    // set metadata and add version
    FileMetadata fmd2 = new FileMetadata();
    fmd2.setId(2L);
    fmd2.setLabel("foobar.dta");
    fmd2.setDataFile(datafile2);
    datafile2.getFileMetadatas().add(fmd2);
    dataFileList.add(datafile2);
    IngestUtil.checkForDuplicateFileNamesFinal(datasetVersion, dataFileList);
    boolean file2NameAltered = false;
    for (DataFile df : dataFileList) {
        if (df.getFileMetadata().getLabel().equals("foobar-1.dta")) {
            file2NameAltered = true;
            break;
        }
    }
    // check filename is altered since tabular and will change to .tab after ingest
    // (expected value first, so a failure message reads correctly)
    assertEquals(true, file2NameAltered);
}
Also used : DataFile(edu.harvard.iq.dataverse.DataFile) DataTable(edu.harvard.iq.dataverse.DataTable) Dataset(edu.harvard.iq.dataverse.Dataset) MocksFactory.makeDataset(edu.harvard.iq.dataverse.mocks.MocksFactory.makeDataset) ArrayList(java.util.ArrayList) FileMetadata(edu.harvard.iq.dataverse.FileMetadata) DatasetVersion(edu.harvard.iq.dataverse.DatasetVersion) SimpleDateFormat(java.text.SimpleDateFormat) Timestamp(java.sql.Timestamp) Date(java.util.Date) Test(org.junit.Test)

Example 5 with DataTable

use of edu.harvard.iq.dataverse.DataTable in project dataverse by IQSS.

the class CSVFileReaderTest method testSubset.

/*
 * This test will read a CSV file, then attempt to subset
 * the resulting tab-delimited file and verify that the individual variable vectors
 * are legit.
 *
 * Each column is read back from the generated tab file through its own
 * input stream and compared against the hard-coded expected vector.
 */
@Test
public void testSubset() {
    String testFile = "src/test/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/csv/election_precincts.csv";
    Long expectedNumberOfVariables = 13L;
    // aka the number of lines in the TAB file produced by the ingest plugin
    Long expectedNumberOfCases = 24L;
    TabularDataIngest ingestResult = null;
    File generatedTabFile = null;
    DataTable generatedDataTable = null;
    try (BufferedInputStream stream = new BufferedInputStream(new FileInputStream(testFile))) {
        CSVFileReader instance = new CSVFileReader(new CSVFileReaderSpi());
        ingestResult = instance.read(stream, null);
        generatedTabFile = ingestResult.getTabDelimitedFile();
        generatedDataTable = ingestResult.getDataTable();
    } catch (IOException ex) {
        fail("" + ex);
    }
    assertNotNull(generatedDataTable);
    assertNotNull(generatedDataTable.getDataVariables());
    // expected value first, so a failure message reads correctly
    assertEquals(Long.valueOf(generatedDataTable.getDataVariables().size()), generatedDataTable.getVarQuantity());
    assertEquals(expectedNumberOfVariables, generatedDataTable.getVarQuantity());
    assertEquals(expectedNumberOfCases, generatedDataTable.getCaseQuantity());
    // And now let's try and subset the individual vectors
    // First, the "continuous" vectors (we should be able to read these as Double[]):
    int[] floatColumns = { 2 };
    Double[][] floatVectors = {
        { 1.0, 3.0, 4.0, 6.0, 7.0, 8.0, 11.0, 12.0, 76.0, 77.0, 77.0, 77.0, 77.0, 77.0, 77.0, 77.0, 77.0, 77.0, 77.0, 77.0, 77.0, 77.0, 77.0, 77.0 }
    };
    int vectorCount = 0;
    for (int i : floatColumns) {
        if (!generatedDataTable.getDataVariables().get(i).isIntervalContinuous()) {
            fail("Column " + i + " was not properly processed as \"continuous\"");
        }
        // try-with-resources: the stream is released even if the subsetting
        // or the comparison fails
        try (FileInputStream generatedTabInputStream = new FileInputStream(generatedTabFile)) {
            Double[] columnVector = TabularSubsetGenerator.subsetDoubleVector(generatedTabInputStream, i, generatedDataTable.getCaseQuantity().intValue());
            assertArrayEquals("column " + i + ":", floatVectors[vectorCount++], columnVector);
        } catch (IOException ioex) {
            fail("Failed to open generated tab-delimited file for reading" + ioex);
        }
    }
    // Discrete Numerics (aka, integers):
    int[] integerColumns = { 1, 4, 6, 7, 8, 9, 10, 11, 12 };
    Long[][] longVectors = {
        { 1L, 3L, 4L, 6L, 7L, 8L, 11L, 12L, 76L, 77L, 77L, 77L, 77L, 77L, 77L, 77L, 77L, 77L, 77L, 77L, 77L, 77L, 77L, 77L },
        { 1L, 2L, 3L, 4L, 5L, 11L, 13L, 15L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L },
        { 85729227L, 85699791L, 640323976L, 85695847L, 637089796L, 637089973L, 85695001L, 85695077L, 1111111L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L },
        { 205871733L, 205871735L, 205871283L, 258627915L, 257444575L, 205871930L, 260047422L, 262439738L, 1111111L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L },
        { 205871673L, 205871730L, 205871733L, 205872857L, 258627915L, 257444584L, 205873413L, 262439738L, 1111111L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L },
        { 25025000201L, 25025081001L, 25025000701L, 25025050901L, 25025040600L, 25025000502L, 25025040401L, 25025100900L, 1111111L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L },
        { 250250502002L, 250250502003L, 250250501013L, 250250408011L, 250250503001L, 250250103001L, 250250406002L, 250250406001L, 1111111L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L },
        { 250251011024001L, 250251011013003L, 250251304041007L, 250251011013006L, 250251010016000L, 250251011024002L, 250251001005004L, 250251002003002L, 1111111L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L },
        { 2109L, 2110L, 2111L, 2120L, 2121L, 2115L, 2116L, 2122L, 11111L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L, 4444444L }
    };
    vectorCount = 0;
    for (int i : integerColumns) {
        if (!generatedDataTable.getDataVariables().get(i).isIntervalDiscrete() || !generatedDataTable.getDataVariables().get(i).isTypeNumeric()) {
            fail("Column " + i + " was not properly processed as \"discrete numeric\"");
        }
        try (FileInputStream generatedTabInputStream = new FileInputStream(generatedTabFile)) {
            Long[] columnVector = TabularSubsetGenerator.subsetLongVector(generatedTabInputStream, i, generatedDataTable.getCaseQuantity().intValue());
            assertArrayEquals("column " + i + ":", longVectors[vectorCount++], columnVector);
        } catch (IOException ioex) {
            fail("Failed to open generated tab-delimited file for reading" + ioex);
        }
    }
    // And finally, Strings:
    int[] stringColumns = { 0, 3, 5 };
    String[][] stringVectors = {
        { "Dog", "Squirrel", "Antelope", "Zebra", "Lion", "Gazelle", "Cat", "Giraffe", "Cat", "Donkey", "Donkey", "Donkey", "Donkey", "Donkey", "Donkey", "Donkey", "Donkey", "Donkey", "Donkey", "Donkey", "Donkey", "Donkey", "Donkey", "Donkey" },
        { "East Boston", "Charlestown", "South Boston", "Bronx", "Roslindale", "Mission Hill", "Jamaica Plain", "Hyde Park", "Fenway/Kenmore", "Queens", "Queens", "Queens", "Queens", "Queens", "Queens", "Queens", "Queens", "Queens", "Queens", "Queens", "Queens", "Queens", "Queens", "Queens" },
        { "2-06", "1-09", "1-1A", "1-1B", "2-04", "3-05", "1-1C", "1-10A", "41-10A", "41-10A", "41-10A", "41-10A", "41-10A", "41-10A", "41-10A", "41-10A", "41-10A", "41-10A", "41-10A", "41-10A", "41-10A", "41-10A", "41-10A", "41-10A" }
    };
    vectorCount = 0;
    for (int i : stringColumns) {
        if (!generatedDataTable.getDataVariables().get(i).isTypeCharacter()) {
            fail("Column " + i + " was not properly processed as a character vector");
        }
        try (FileInputStream generatedTabInputStream = new FileInputStream(generatedTabFile)) {
            String[] columnVector = TabularSubsetGenerator.subsetStringVector(generatedTabInputStream, i, generatedDataTable.getCaseQuantity().intValue());
            assertArrayEquals("column " + i + ":", stringVectors[vectorCount++], columnVector);
        } catch (IOException ioex) {
            fail("Failed to open generated tab-delimited file for reading" + ioex);
        }
    }
}
Also used : DataTable(edu.harvard.iq.dataverse.DataTable) FileNotFoundException(java.io.FileNotFoundException) IOException(java.io.IOException) FileInputStream(java.io.FileInputStream) BufferedInputStream(java.io.BufferedInputStream) TabularDataIngest(edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataIngest) File(java.io.File) Test(org.junit.Test)

Aggregations

DataTable (edu.harvard.iq.dataverse.DataTable)16 TabularDataIngest (edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataIngest)7 DataFile (edu.harvard.iq.dataverse.DataFile)6 IOException (java.io.IOException)6 Test (org.junit.Test)6 FileMetadata (edu.harvard.iq.dataverse.FileMetadata)5 BufferedInputStream (java.io.BufferedInputStream)5 File (java.io.File)5 FileInputStream (java.io.FileInputStream)5 FileNotFoundException (java.io.FileNotFoundException)4 ArrayList (java.util.ArrayList)4 TabularDataFileReader (edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataFileReader)3 Dataset (edu.harvard.iq.dataverse.Dataset)2 DatasetVersion (edu.harvard.iq.dataverse.DatasetVersion)2 DataVariable (edu.harvard.iq.dataverse.datavariable.DataVariable)2 MocksFactory.makeDataset (edu.harvard.iq.dataverse.mocks.MocksFactory.makeDataset)2 ApiToken (edu.harvard.iq.dataverse.authorization.users.ApiToken)1 VariableInterval (edu.harvard.iq.dataverse.datavariable.DataVariable.VariableInterval)1 VariableType (edu.harvard.iq.dataverse.datavariable.DataVariable.VariableType)1 BufferedReader (java.io.BufferedReader)1