Search in sources :

Example 56 with DataFile

use of edu.harvard.iq.dataverse.DataFile in project dataverse by IQSS.

In the class IngestServiceBean, the method ingestAsTabular:

/**
 * Ingests the stored DataFile identified by {@code datafile_id} as tabular data.
 * <p>
 * Looks up an ingest plugin for the file's MIME type, reads the file (copying
 * it to a local temp file first if it lives on remote storage), attaches the
 * produced DataTable to the DataFile, runs post-ingest tasks (summary
 * statistics / UNFs), saves everything to the database, and finally swaps the
 * stored file contents for the generated tab-delimited version (keeping the
 * original as an aux backup file).
 * <p>
 * On any failure the DataFile is flagged with an ingest problem, a failure
 * report is recorded, and {@code false} is returned.
 *
 * @param datafile_id database id of the DataFile to ingest
 * @return true if ingest fully succeeded; false otherwise
 */
public boolean ingestAsTabular(Long datafile_id) {
    DataFile dataFile = fileService.find(datafile_id);
    boolean ingestSuccessful = false;
    // Locate an ingest plugin for the file format by looking it up
    // with the Ingest Service Provider Registry:
    String fileName = dataFile.getFileMetadata().getLabel();
    TabularDataFileReader ingestPlugin = getTabDataReaderByMimeType(dataFile.getContentType());
    if (ingestPlugin == null) {
        dataFile.SetIngestProblem();
        FileUtil.createIngestFailureReport(dataFile, "No ingest plugin found for file type " + dataFile.getContentType());
        dataFile = fileService.save(dataFile);
        logger.warning("Ingest failure.");
        return false;
    }
    BufferedInputStream inputStream = null;
    File additionalData = null;
    StorageIO<DataFile> storageIO = null;
    try {
        storageIO = dataFile.getStorageIO();
        storageIO.open();
        if (storageIO.isLocalFile()) {
            inputStream = new BufferedInputStream(storageIO.getInputStream());
        } else {
            // Remote storage: copy the file into a local temp file first,
            // since the ingest plugin needs a local stream to read.
            File tempFile = File.createTempFile("tempIngestSourceFile", ".tmp");
            // Make sure the local copy does not outlive the JVM even if we
            // fail before it can be cleaned up.
            tempFile.deleteOnExit();
            // try-with-resources closes both channels (previously both the
            // read channel and the FileOutputStream channel were leaked):
            try (ReadableByteChannel dataFileChannel = storageIO.getReadChannel();
                 FileChannel tempIngestSourceChannel = new FileOutputStream(tempFile).getChannel()) {
                tempIngestSourceChannel.transferFrom(dataFileChannel, 0, storageIO.getSize());
            }
            inputStream = new BufferedInputStream(new FileInputStream(tempFile));
            logger.fine("Saved " + storageIO.getSize() + " bytes in a local temp file.");
        }
    } catch (IOException ioEx) {
        dataFile.SetIngestProblem();
        FileUtil.createIngestFailureReport(dataFile, "IO Exception occured while trying to open the file for reading.");
        dataFile = fileService.save(dataFile);
        logger.warning("Ingest failure (No file produced).");
        return false;
    }
    // Apply any per-request options (text encoding, extra variable-labels file):
    IngestRequest ingestRequest = dataFile.getIngestRequest();
    if (ingestRequest != null) {
        if (ingestRequest.getTextEncoding() != null && !ingestRequest.getTextEncoding().equals("")) {
            logger.fine("Setting language encoding to " + ingestRequest.getTextEncoding());
            ingestPlugin.setDataLanguageEncoding(ingestRequest.getTextEncoding());
        }
        if (ingestRequest.getLabelsFile() != null) {
            additionalData = new File(ingestRequest.getLabelsFile());
        }
    }
    TabularDataIngest tabDataIngest = null;
    try {
        // additionalData may legitimately be null; the plugin accepts that,
        // so the old if/else around this call was unnecessary.
        tabDataIngest = ingestPlugin.read(inputStream, additionalData);
    } catch (IOException ingestEx) {
        dataFile.SetIngestProblem();
        FileUtil.createIngestFailureReport(dataFile, ingestEx.getMessage());
        // (was saved twice here; a single save is sufficient)
        dataFile = fileService.save(dataFile);
        logger.fine("Ingest failure (IO Exception): " + ingestEx.getMessage() + ".");
        return false;
    } catch (Exception unknownEx) {
        // this is a bit of a kludge, to make sure no unknown exceptions are
        // left uncaught.
        dataFile.SetIngestProblem();
        FileUtil.createIngestFailureReport(dataFile, unknownEx.getMessage());
        // (was saved twice here; a single save is sufficient)
        dataFile = fileService.save(dataFile);
        logger.warning("Ingest failure (Exception " + unknownEx.getClass() + "): " + unknownEx.getMessage() + ".");
        return false;
    } finally {
        // The source stream is no longer needed once the plugin has read it
        // (previously it was never closed).
        try {
            inputStream.close();
        } catch (IOException ignored) {
            // best-effort close; nothing useful to do on failure
        }
    }
    // Remember the original attributes, so the file can be restored if a
    // later step fails:
    String originalContentType = dataFile.getContentType();
    String originalFileName = dataFile.getFileMetadata().getLabel();
    long originalFileSize = dataFile.getFilesize();
    boolean postIngestTasksSuccessful = false;
    boolean databaseSaveSuccessful = false;
    if (tabDataIngest != null) {
        File tabFile = tabDataIngest.getTabDelimitedFile();
        if (tabDataIngest.getDataTable() != null && tabFile != null && tabFile.exists()) {
            logger.info("Tabular data successfully ingested; DataTable with " + tabDataIngest.getDataTable().getVarQuantity() + " variables produced.");
            logger.info("Tab-delimited file produced: " + tabFile.getAbsolutePath());
            dataFile.setFilesize(tabFile.length());
            // and change the mime type to "tabular" on the final datafile,
            // and replace (or add) the extension ".tab" to the filename:
            dataFile.setContentType(FileUtil.MIME_TYPE_TAB);
            IngestUtil.modifyExistingFilename(dataFile.getOwner().getLatestVersion(), dataFile.getFileMetadata(), FileUtil.replaceExtension(fileName, "tab"));
            // BUGFIX: the content type was just reset to MIME_TYPE_TAB above,
            // so this comparison must use the ORIGINAL content type (the old
            // check against dataFile.getContentType() could never be true):
            if (FileUtil.MIME_TYPE_CSV_ALT.equals(originalContentType)) {
                tabDataIngest.getDataTable().setOriginalFileFormat(FileUtil.MIME_TYPE_CSV);
            } else {
                tabDataIngest.getDataTable().setOriginalFileFormat(originalContentType);
            }
            dataFile.setDataTable(tabDataIngest.getDataTable());
            tabDataIngest.getDataTable().setDataFile(dataFile);
            try {
                produceSummaryStatistics(dataFile, tabFile);
                postIngestTasksSuccessful = true;
            } catch (IOException postIngestEx) {
                dataFile.SetIngestProblem();
                FileUtil.createIngestFailureReport(dataFile, "Ingest failed to produce Summary Statistics and/or UNF signatures; " + postIngestEx.getMessage());
                restoreIngestedDataFile(dataFile, tabDataIngest, originalFileSize, originalFileName, originalContentType);
                dataFile = fileService.save(dataFile);
                logger.warning("Ingest failure: post-ingest tasks.");
            }
            if (!postIngestTasksSuccessful) {
                return false;
            }
            dataFile.setIngestDone();
            // delete the ingest request, if exists:
            if (dataFile.getIngestRequest() != null) {
                dataFile.getIngestRequest().setDataFile(null);
                dataFile.setIngestRequest(null);
            }
            try {
                /* 
                         In order to test a database save failure, uncomment this:
                        
                        if (true) {
                            throw new EJBException("Deliberate database save failure");
                        }
                     */
                dataFile = fileService.save(dataFile);
                databaseSaveSuccessful = true;
                logger.fine("Ingest (" + dataFile.getFileMetadata().getLabel() + ") saved in the database.");
                if (additionalData != null) {
                    // remove the extra tempfile, if there was one:
                    additionalData.delete();
                }
            } catch (Exception unknownEx) {
                // this means that an error occurred while saving the datafile
                // in the database; re-fetch a clean copy and flag it.
                logger.warning("Ingest failure: Failed to save tabular metadata (datatable, datavariables, etc.) in the database. Clearing the datafile object.");
                dataFile = fileService.find(datafile_id);
                if (dataFile != null) {
                    dataFile.SetIngestProblem();
                    FileUtil.createIngestFailureReport(dataFile, "Ingest produced tabular data, but failed to save it in the database; " + unknownEx.getMessage() + " No further information is available.");
                    restoreIngestedDataFile(dataFile, tabDataIngest, originalFileSize, originalFileName, originalContentType);
                    dataFile = fileService.save(dataFile);
                }
            }
            if (!databaseSaveSuccessful) {
                return false;
            }
            // Finally, let's swap the original and the tabular files:
            try {
                /* Start of save as backup */
                StorageIO<DataFile> dataAccess = dataFile.getStorageIO();
                dataAccess.open();
                // and we want to save the original of the ingested file:
                try {
                    dataAccess.backupAsAux(FileUtil.SAVED_ORIGINAL_FILENAME_EXTENSION);
                    logger.fine("Saved the ingested original as a backup aux file " + FileUtil.SAVED_ORIGINAL_FILENAME_EXTENSION);
                } catch (IOException iox) {
                    logger.warning("Failed to save the ingested original! " + iox.getMessage());
                }
                // Replace contents of the file with the tab-delimited data produced:
                dataAccess.savePath(Paths.get(tabFile.getAbsolutePath()));
                // Reset the file size:
                dataFile.setFilesize(dataAccess.getSize());
                // delete the temp tab-file:
                tabFile.delete();
            /*end of save as backup */
            } catch (Exception e) {
                // this probably means that an error occurred while saving the
                // file to the file system; re-fetch a clean copy and flag it.
                logger.warning("Failed to save the tabular file produced by the ingest (resetting the ingested DataFile back to its original state)");
                dataFile = fileService.find(datafile_id);
                if (dataFile != null) {
                    dataFile.SetIngestProblem();
                    FileUtil.createIngestFailureReport(dataFile, "Failed to save the tabular file produced by the ingest.");
                    restoreIngestedDataFile(dataFile, tabDataIngest, originalFileSize, originalFileName, originalContentType);
                    dataFile = fileService.save(dataFile);
                }
            }
            ingestSuccessful = true;
        }
    } else {
        logger.warning("Ingest failed to produce data object.");
    }
    return ingestSuccessful;
}
Also used : TabularDataFileReader(edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataFileReader) ReadableByteChannel(java.nio.channels.ReadableByteChannel) FileChannel(java.nio.channels.FileChannel) IOException(java.io.IOException) FileInputStream(java.io.FileInputStream) FileExceedsMaxSizeException(edu.harvard.iq.dataverse.datasetutility.FileExceedsMaxSizeException) UnsupportedDataAccessOperationException(edu.harvard.iq.dataverse.dataaccess.UnsupportedDataAccessOperationException) ParseException(java.text.ParseException) JMSException(javax.jms.JMSException) FileNotFoundException(java.io.FileNotFoundException) EJBException(javax.ejb.EJBException) IOException(java.io.IOException) DataFile(edu.harvard.iq.dataverse.DataFile) BufferedInputStream(java.io.BufferedInputStream) FileOutputStream(java.io.FileOutputStream) TabularDataIngest(edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataIngest) DataFile(edu.harvard.iq.dataverse.DataFile) File(java.io.File)

Example 57 with DataFile

use of edu.harvard.iq.dataverse.DataFile in project dataverse by IQSS.

In the class IngestServiceBean, the method produceContinuousSummaryStatistics:

/**
 * Computes summary statistics and UNF signatures for every continuous
 * (interval-continuous) variable in the DataFile's DataTable, subsetting each
 * column out of the generated tab-delimited file.
 *
 * @param dataFile the ingested DataFile whose DataTable describes the variables
 * @param generatedTabularFile the tab-delimited file produced by ingest
 * @throws IOException if subsetting a column from the tabular file fails
 */
public void produceContinuousSummaryStatistics(DataFile dataFile, File generatedTabularFile) throws IOException {
    for (int i = 0; i < dataFile.getDataTable().getVarQuantity(); i++) {
        if (dataFile.getDataTable().getDataVariables().get(i).isIntervalContinuous()) {
            logger.fine("subsetting continuous vector");
            // NOTE(review): storageIO is opened but never read in this loop;
            // presumably kept for a side effect of open() — confirm before removing.
            StorageIO<DataFile> storageIO = dataFile.getStorageIO();
            storageIO.open();
            if ("float".equals(dataFile.getDataTable().getDataVariables().get(i).getFormat())) {
                // try-with-resources closes the stream (previously leaked, once per variable):
                Float[] variableVector;
                try (FileInputStream subsetIn = new FileInputStream(generatedTabularFile)) {
                    variableVector = TabularSubsetGenerator.subsetFloatVector(subsetIn, i, dataFile.getDataTable().getCaseQuantity().intValue());
                }
                logger.fine("Calculating summary statistics on a Float vector;");
                calculateContinuousSummaryStatistics(dataFile, i, variableVector);
                // calculate the UNF while we are at it:
                logger.fine("Calculating UNF on a Float vector;");
                calculateUNF(dataFile, i, variableVector);
                variableVector = null;
            } else {
                Double[] variableVector;
                try (FileInputStream subsetIn = new FileInputStream(generatedTabularFile)) {
                    variableVector = TabularSubsetGenerator.subsetDoubleVector(subsetIn, i, dataFile.getDataTable().getCaseQuantity().intValue());
                }
                logger.fine("Calculating summary statistics on a Double vector;");
                calculateContinuousSummaryStatistics(dataFile, i, variableVector);
                // calculate the UNF while we are at it:
                logger.fine("Calculating UNF on a Double vector;");
                calculateUNF(dataFile, i, variableVector);
                variableVector = null;
            }
            logger.fine("Done! (continuous);");
        }
    }
}
Also used : DataFile(edu.harvard.iq.dataverse.DataFile) FileInputStream(java.io.FileInputStream)

Example 58 with DataFile

use of edu.harvard.iq.dataverse.DataFile in project dataverse by IQSS.

In the class IngestServiceBean, the method addFilesToDataset:

/**
 * Attaches a list of brand-new DataFiles to the parent Dataset, and their
 * FileMetadatas to the supplied DatasetVersion.
 * <p>
 * A null or empty list is a no-op.
 *
 * @param version the dataset version the file metadata is added to
 * @param newFiles the new data files to attach
 */
public void addFilesToDataset(DatasetVersion version, List<DataFile> newFiles) {
    if (newFiles != null && !newFiles.isEmpty()) {
        Dataset dataset = version.getDataset();
        for (DataFile dataFile : newFiles) {
            // These are all brand new files, so each should have exactly
            // one FileMetadata total. -- L.A.
            // (Assumes getFileMetadatas().get(0) and getFileMetadata() refer
            // to the same object for a new file — consistent with the
            // original code, which used both interchangeably.)
            FileMetadata fileMetadata = dataFile.getFileMetadatas().get(0);
            // Attach the file to the dataset and to the version:
            dataFile.setOwner(dataset);
            version.getFileMetadatas().add(fileMetadata);
            fileMetadata.setDatasetVersion(version);
            dataset.getFiles().add(dataFile);
        }
    }
}
Also used : DataFile(edu.harvard.iq.dataverse.DataFile) Dataset(edu.harvard.iq.dataverse.Dataset) FileMetadata(edu.harvard.iq.dataverse.FileMetadata)

Example 59 with DataFile

use of edu.harvard.iq.dataverse.DataFile in project dataverse by IQSS.

In the class IngestServiceBean, the method produceDiscreteNumericSummaryStatistics:

/**
 * Computes summary statistics and UNF signatures for every discrete numeric
 * variable in the DataFile's DataTable, subsetting each column out of the
 * generated tab-delimited file.
 *
 * @param dataFile the ingested DataFile whose DataTable describes the variables
 * @param generatedTabularFile the tab-delimited file produced by ingest
 * @throws IOException if subsetting a column from the tabular file fails
 */
public void produceDiscreteNumericSummaryStatistics(DataFile dataFile, File generatedTabularFile) throws IOException {
    for (int i = 0; i < dataFile.getDataTable().getVarQuantity(); i++) {
        if (dataFile.getDataTable().getDataVariables().get(i).isIntervalDiscrete() && dataFile.getDataTable().getDataVariables().get(i).isTypeNumeric()) {
            logger.fine("subsetting discrete-numeric vector");
            // NOTE(review): storageIO is opened but never read in this loop;
            // presumably kept for a side effect of open() — confirm before removing.
            StorageIO<DataFile> storageIO = dataFile.getStorageIO();
            storageIO.open();
            // try-with-resources closes the stream (previously leaked, once per variable):
            Long[] variableVector;
            try (FileInputStream subsetIn = new FileInputStream(generatedTabularFile)) {
                variableVector = TabularSubsetGenerator.subsetLongVector(subsetIn, i, dataFile.getDataTable().getCaseQuantity().intValue());
            }
            // We are discussing calculating the same summary stats for
            // all numerics (the same kind of sumstats that we've been calculating
            // for numeric continuous type)  -- L.A. Jul. 2014
            calculateContinuousSummaryStatistics(dataFile, i, variableVector);
            // calculate the UNF while we are at it:
            logger.fine("Calculating UNF on a Long vector");
            calculateUNF(dataFile, i, variableVector);
            logger.fine("Done! (discrete numeric)");
            variableVector = null;
        }
    }
}
Also used : DataFile(edu.harvard.iq.dataverse.DataFile) FileInputStream(java.io.FileInputStream)

Example 60 with DataFile

use of edu.harvard.iq.dataverse.DataFile in project dataverse by IQSS.

In the class IngestUtil, the method checkForDuplicateFileNamesFinal:

/**
 * Ensures every new data file has a unique name within the dataset version,
 * renaming any file whose path name collides with an existing one.
 *
 * @param version the dataset version
 * @param newFiles the list of new data files to add to it
 */
public static void checkForDuplicateFileNamesFinal(DatasetVersion version, List<DataFile> newFiles) {
    // Step 1: gather the path names already present in the version
    // (unique path name = directoryLabel + file separator + fileLabel).
    Set<String> existingPaths = existingPathNamesAsSet(version);
    // Step 2: run each incoming file's metadata through the duplicate check,
    // which returns either the original label or a de-duplicated one.
    for (DataFile newFile : newFiles) {
        FileMetadata metadata = newFile.getFileMetadata();
        String uniqueLabel = duplicateFilenameCheck(metadata, existingPaths);
        metadata.setLabel(uniqueLabel);
    }
}
Also used : DataFile(edu.harvard.iq.dataverse.DataFile) FileMetadata(edu.harvard.iq.dataverse.FileMetadata)

Aggregations

DataFile (edu.harvard.iq.dataverse.DataFile)111 Dataset (edu.harvard.iq.dataverse.Dataset)39 IOException (java.io.IOException)39 FileMetadata (edu.harvard.iq.dataverse.FileMetadata)30 ArrayList (java.util.ArrayList)25 DatasetVersion (edu.harvard.iq.dataverse.DatasetVersion)20 File (java.io.File)20 FileNotFoundException (java.io.FileNotFoundException)18 Path (javax.ws.rs.Path)18 Dataverse (edu.harvard.iq.dataverse.Dataverse)17 FileInputStream (java.io.FileInputStream)16 AuthenticatedUser (edu.harvard.iq.dataverse.authorization.users.AuthenticatedUser)14 CommandException (edu.harvard.iq.dataverse.engine.command.exception.CommandException)13 Date (java.util.Date)13 GET (javax.ws.rs.GET)13 Test (org.junit.Test)13 Timestamp (java.sql.Timestamp)11 InputStream (java.io.InputStream)10 DataVariable (edu.harvard.iq.dataverse.datavariable.DataVariable)8 FileOutputStream (java.io.FileOutputStream)8