Search in sources :

Example 6 with TabularDataIngest

use of edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataIngest in project dataverse by IQSS.

the class IngestServiceBean method ingestAsTabular.

/**
 * Attempts to ingest the DataFile with the given id as tabular data.
 *
 * The method looks up an ingest plugin by the file's content type, reads the
 * file through it, attaches the resulting DataTable to the DataFile, produces
 * summary statistics, saves the DataFile, and finally replaces the stored
 * bytes with the tab-delimited file (keeping the original as an aux backup).
 * On every failure path for which a DataFile is available, the file is
 * flagged with an ingest problem and a failure report is created.
 *
 * @param datafile_id database id of the DataFile to ingest
 * @return true only if every step, including the final file swap, succeeded
 */
public boolean ingestAsTabular(Long datafile_id) {
    DataFile dataFile = fileService.find(datafile_id);
    if (dataFile == null) {
        // No entity to attach a failure report to; just log and give up.
        logger.warning("Ingest failure: could not find DataFile with id " + datafile_id + ".");
        return false;
    }
    // Locate ingest plugin for the file format by looking
    // it up with the Ingest Service Provider Registry:
    TabularDataFileReader ingestPlugin = getTabDataReaderByMimeType(dataFile.getContentType());
    if (ingestPlugin == null) {
        saveIngestFailure(dataFile, "No ingest plugin found for file type " + dataFile.getContentType());
        logger.warning("Ingest failure.");
        return false;
    }
    BufferedInputStream inputStream = null;
    // Local copy of the data; only created when the storage is not a local file.
    File tempFile = null;
    try {
        StorageIO<DataFile> storageIO = dataFile.getStorageIO();
        storageIO.open();
        if (storageIO.isLocalFile()) {
            inputStream = new BufferedInputStream(storageIO.getInputStream());
        } else {
            ReadableByteChannel dataFileChannel = storageIO.getReadChannel();
            tempFile = File.createTempFile("tempIngestSourceFile", ".tmp");
            // Close the output side as soon as the copy is done so that the
            // file descriptor is not leaked.
            try (FileOutputStream tempOut = new FileOutputStream(tempFile);
                    FileChannel tempIngestSourceChannel = tempOut.getChannel()) {
                tempIngestSourceChannel.transferFrom(dataFileChannel, 0, storageIO.getSize());
            }
            inputStream = new BufferedInputStream(new FileInputStream(tempFile));
            logger.fine("Saved " + storageIO.getSize() + " bytes in a local temp file.");
        }
    } catch (IOException ioEx) {
        releaseIngestSource(inputStream, tempFile);
        saveIngestFailure(dataFile, "IO Exception occured while trying to open the file for reading.");
        logger.warning("Ingest failure (No file produced).");
        return false;
    }
    File additionalData = null;
    IngestRequest ingestRequest = dataFile.getIngestRequest();
    if (ingestRequest != null) {
        if (ingestRequest.getTextEncoding() != null && !ingestRequest.getTextEncoding().equals("")) {
            logger.fine("Setting language encoding to " + ingestRequest.getTextEncoding());
            ingestPlugin.setDataLanguageEncoding(ingestRequest.getTextEncoding());
        }
        if (ingestRequest.getLabelsFile() != null) {
            additionalData = new File(ingestRequest.getLabelsFile());
        }
    }
    TabularDataIngest tabDataIngest = null;
    try {
        // additionalData stays null when the request supplied no labels file.
        tabDataIngest = ingestPlugin.read(inputStream, additionalData);
    } catch (IOException ingestEx) {
        saveIngestFailure(dataFile, ingestEx.getMessage());
        logger.fine("Ingest failure (IO Exception): " + ingestEx.getMessage() + ".");
        return false;
    } catch (Exception unknownEx) {
        // this is a bit of a kludge, to make sure no unknown exceptions are
        // left uncaught.
        saveIngestFailure(dataFile, unknownEx.getMessage());
        logger.warning("Ingest failure (Exception " + unknownEx.getClass() + "): " + unknownEx.getMessage() + ".");
        return false;
    } finally {
        // The source stream (and its temp copy, if any) is no longer needed
        // once the plugin has returned or failed.
        releaseIngestSource(inputStream, tempFile);
    }
    // Remember the pre-ingest state so it can be restored if a later step fails.
    String originalContentType = dataFile.getContentType();
    String originalFileName = dataFile.getFileMetadata().getLabel();
    long originalFileSize = dataFile.getFilesize();
    if (tabDataIngest == null) {
        logger.warning("Ingest failed to produce data obect.");
        return false;
    }
    File tabFile = tabDataIngest.getTabDelimitedFile();
    if (tabDataIngest.getDataTable() == null || tabFile == null || !tabFile.exists()) {
        // NOTE(review): this path returns without flagging an ingest problem or
        // logging, as in the original code - confirm that is intended.
        return false;
    }
    logger.info("Tabular data successfully ingested; DataTable with " + tabDataIngest.getDataTable().getVarQuantity() + " variables produced.");
    logger.info("Tab-delimited file produced: " + tabFile.getAbsolutePath());
    dataFile.setFilesize(tabFile.length());
    // and change the mime type to "tabular" on the final datafile,
    // and replace (or add) the extension ".tab" to the filename:
    dataFile.setContentType(FileUtil.MIME_TYPE_TAB);
    IngestUtil.modifyExistingFilename(dataFile.getOwner().getLatestVersion(), dataFile.getFileMetadata(), FileUtil.replaceExtension(originalFileName, "tab"));
    // Compare against the ORIGINAL type: the DataFile's own content type was
    // just overwritten with MIME_TYPE_TAB above, so checking it here could
    // never match the alternative CSV type.
    if (FileUtil.MIME_TYPE_CSV_ALT.equals(originalContentType)) {
        tabDataIngest.getDataTable().setOriginalFileFormat(FileUtil.MIME_TYPE_CSV);
    } else {
        tabDataIngest.getDataTable().setOriginalFileFormat(originalContentType);
    }
    dataFile.setDataTable(tabDataIngest.getDataTable());
    tabDataIngest.getDataTable().setDataFile(dataFile);
    try {
        produceSummaryStatistics(dataFile, tabFile);
    } catch (IOException postIngestEx) {
        dataFile.SetIngestProblem();
        FileUtil.createIngestFailureReport(dataFile, "Ingest failed to produce Summary Statistics and/or UNF signatures; " + postIngestEx.getMessage());
        restoreIngestedDataFile(dataFile, tabDataIngest, originalFileSize, originalFileName, originalContentType);
        dataFile = fileService.save(dataFile);
        logger.warning("Ingest failure: post-ingest tasks.");
        return false;
    }
    dataFile.setIngestDone();
    // delete the ingest request, if exists:
    if (dataFile.getIngestRequest() != null) {
        dataFile.getIngestRequest().setDataFile(null);
        dataFile.setIngestRequest(null);
    }
    try {
        // (To test a database save failure, throw an EJBException here.)
        dataFile = fileService.save(dataFile);
        logger.fine("Ingest (" + dataFile.getFileMetadata().getLabel() + ".");
        if (additionalData != null) {
            // remove the extra tempfile, if there was one:
            additionalData.delete();
        }
    } catch (Exception unknownEx) {
        // this means that an error occurred while saving the datafile
        // in the database.
        logger.warning("Ingest failure: Failed to save tabular metadata (datatable, datavariables, etc.) in the database. Clearing the datafile object.");
        // Re-read the entity and work on that fresh copy instead of the one
        // whose save just failed.
        dataFile = fileService.find(datafile_id);
        if (dataFile != null) {
            dataFile.SetIngestProblem();
            FileUtil.createIngestFailureReport(dataFile, "Ingest produced tabular data, but failed to save it in the database; " + unknownEx.getMessage() + " No further information is available.");
            restoreIngestedDataFile(dataFile, tabDataIngest, originalFileSize, originalFileName, originalContentType);
            dataFile = fileService.save(dataFile);
        }
        return false;
    }
    // Finally, let's swap the original and the tabular files:
    try {
        StorageIO<DataFile> dataAccess = dataFile.getStorageIO();
        dataAccess.open();
        // and we want to save the original of the ingested file; a failure
        // to back it up is logged but does not abort the swap:
        try {
            dataAccess.backupAsAux(FileUtil.SAVED_ORIGINAL_FILENAME_EXTENSION);
            logger.fine("Saved the ingested original as a backup aux file " + FileUtil.SAVED_ORIGINAL_FILENAME_EXTENSION);
        } catch (IOException iox) {
            logger.warning("Failed to save the ingested original! " + iox.getMessage());
        }
        // Replace contents of the file with the tab-delimited data produced:
        dataAccess.savePath(Paths.get(tabFile.getAbsolutePath()));
        // Reset the file size:
        dataFile.setFilesize(dataAccess.getSize());
        // delete the temp tab-file:
        tabFile.delete();
    } catch (Exception e) {
        // this probably means that an error occurred while saving the file to the file system
        logger.warning("Failed to save the tabular file produced by the ingest (resetting the ingested DataFile back to its original state)");
        dataFile = fileService.find(datafile_id);
        if (dataFile != null) {
            dataFile.SetIngestProblem();
            FileUtil.createIngestFailureReport(dataFile, "Failed to save the tabular file produced by the ingest.");
            restoreIngestedDataFile(dataFile, tabDataIngest, originalFileSize, originalFileName, originalContentType);
            dataFile = fileService.save(dataFile);
        }
        // The file has just been flagged as an ingest problem; do not
        // report success to the caller.
        return false;
    }
    return true;
}

/**
 * Flags the DataFile with an ingest problem, creates a failure report with
 * the given message, and saves the DataFile.
 */
private void saveIngestFailure(DataFile dataFile, String reportMessage) {
    dataFile.SetIngestProblem();
    FileUtil.createIngestFailureReport(dataFile, reportMessage);
    fileService.save(dataFile);
}

/**
 * Best-effort cleanup of the ingest source: closes the stream and deletes
 * the temporary local copy. Either argument may be null.
 */
private void releaseIngestSource(BufferedInputStream inputStream, File tempFile) {
    if (inputStream != null) {
        try {
            inputStream.close();
        } catch (IOException ignored) {
            // Nothing useful can be done about a failed close of a read-only stream.
            logger.fine("Failed to close the ingest source stream: " + ignored.getMessage());
        }
    }
    if (tempFile != null && !tempFile.delete()) {
        logger.fine("Could not delete temporary ingest source file " + tempFile.getAbsolutePath());
    }
}
Also used : TabularDataFileReader(edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataFileReader) ReadableByteChannel(java.nio.channels.ReadableByteChannel) FileChannel(java.nio.channels.FileChannel) IOException(java.io.IOException) FileInputStream(java.io.FileInputStream) FileExceedsMaxSizeException(edu.harvard.iq.dataverse.datasetutility.FileExceedsMaxSizeException) UnsupportedDataAccessOperationException(edu.harvard.iq.dataverse.dataaccess.UnsupportedDataAccessOperationException) ParseException(java.text.ParseException) JMSException(javax.jms.JMSException) FileNotFoundException(java.io.FileNotFoundException) EJBException(javax.ejb.EJBException) IOException(java.io.IOException) DataFile(edu.harvard.iq.dataverse.DataFile) BufferedInputStream(java.io.BufferedInputStream) FileOutputStream(java.io.FileOutputStream) TabularDataIngest(edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataIngest) DataFile(edu.harvard.iq.dataverse.DataFile) File(java.io.File)

Example 7 with TabularDataIngest

use of edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataIngest in project dataverse by IQSS.

the class IngestServiceBean method main.

/**
 * Command-line driver: ingests one file with the plugin registered for the
 * given type, copies the resulting tab-delimited file next to the input
 * (with the extension replaced by "tab"), and prints the variable count,
 * observation count, UNF, and one line per variable to standard output.
 * Exits with status 1 on any failure.
 *
 * @param args args[0] is the path of the file to ingest, args[1] is its type
 */
public static void main(String[] args) {
    // Check the argument count before indexing: reading args[0]/args[1]
    // unconditionally would throw ArrayIndexOutOfBoundsException instead of
    // printing the usage message.
    if (args == null || args.length < 2 || args[0] == null || args[1] == null || "".equals(args[0]) || "".equals(args[1])) {
        System.err.println("Usage: java edu.harvard.iq.dataverse.ingest.IngestServiceBean <file> <type>.");
        System.exit(1);
        return;
    }
    String file = args[0];
    String type = args[1];
    BufferedInputStream fileInputStream;
    try {
        fileInputStream = new BufferedInputStream(new FileInputStream(new File(file)));
    } catch (FileNotFoundException notfoundEx) {
        System.err.println("Could not open file " + file + ".");
        System.exit(1);
        return;
    }
    TabularDataIngest tabDataIngest = null;
    try {
        TabularDataFileReader ingestPlugin = getTabDataReaderByMimeType(type);
        if (ingestPlugin == null) {
            System.err.println("Could not locate an ingest plugin for type " + type + ".");
            System.exit(1);
        }
        tabDataIngest = ingestPlugin.read(fileInputStream, null);
    } catch (IOException ingestEx) {
        System.err.println("Caught an exception trying to ingest file " + file + ".");
        System.exit(1);
    } finally {
        // The input is not needed once the plugin has returned or failed.
        try {
            fileInputStream.close();
        } catch (IOException ignored) {
            // Best effort: a failed close of a read-only stream is not actionable here.
        }
    }
    try {
        if (tabDataIngest != null) {
            File tabFile = tabDataIngest.getTabDelimitedFile();
            if (tabDataIngest.getDataTable() != null && tabFile != null && tabFile.exists()) {
                String tabFilename = FileUtil.replaceExtension(file, "tab");
                Files.copy(Paths.get(tabFile.getAbsolutePath()), Paths.get(tabFilename), StandardCopyOption.REPLACE_EXISTING);
                DataTable dataTable = tabDataIngest.getDataTable();
                System.out.println("NVARS: " + dataTable.getVarQuantity());
                System.out.println("NOBS: " + dataTable.getCaseQuantity());
                System.out.println("UNF: " + dataTable.getUnf());
                for (int i = 0; i < dataTable.getVarQuantity(); i++) {
                    // Classify each variable: continuous, other numeric, or character.
                    String vartype;
                    if (dataTable.getDataVariables().get(i).isIntervalContinuous()) {
                        vartype = "numeric-continuous";
                    } else if (dataTable.getDataVariables().get(i).isTypeNumeric()) {
                        vartype = "numeric-discrete";
                    } else {
                        vartype = "character";
                    }
                    System.out.print("VAR" + i + " ");
                    System.out.print(dataTable.getDataVariables().get(i).getName() + " ");
                    System.out.print(vartype + " ");
                    System.out.print(dataTable.getDataVariables().get(i).getUnf());
                    System.out.println();
                }
            } else {
                System.err.println("Ingest failed to produce tab file or data table for file " + file + ".");
                System.exit(1);
            }
        } else {
            System.err.println("Ingest resulted in a null tabDataIngest object for file " + file + ".");
            System.exit(1);
        }
    } catch (IOException ex) {
        System.err.println("Caught an exception trying to save ingested data for file " + file + ".");
        System.exit(1);
    }
}
Also used : TabularDataFileReader(edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataFileReader) DataTable(edu.harvard.iq.dataverse.DataTable) BufferedInputStream(java.io.BufferedInputStream) FileNotFoundException(java.io.FileNotFoundException) TabularDataIngest(edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataIngest) IOException(java.io.IOException) DataFile(edu.harvard.iq.dataverse.DataFile) File(java.io.File) FileInputStream(java.io.FileInputStream)

Example 8 with TabularDataIngest

use of edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataIngest in project dataverse by IQSS.

the class TestIngest method datafile.

// @EJB
/**
 * Test endpoint: ingests the file at the given path with the plugin
 * registered for the given type, copies the resulting tab-delimited file
 * next to the input (extension replaced by "tab"), and returns a plain-text
 * summary (variable count, observation count, UNF, and one line per
 * variable). All failures are reported as a plain-text message in the
 * response body rather than as an exception.
 *
 * NOTE(review): fileName comes straight from the query string and is used
 * both to open a file and to derive the path of the file that is written;
 * confirm that access to this endpoint is restricted.
 *
 * @param fileName path of the file to ingest
 * @param fileType type used to look up the ingest plugin
 * @return the textual summary, or a one-line usage/error message
 */
@Path("test/file")
@GET
@Produces({ "text/plain" })
public String datafile(@QueryParam("fileName") String fileName, @QueryParam("fileType") String fileType, @Context UriInfo uriInfo, @Context HttpHeaders headers, @Context HttpServletResponse response) /*throws NotFoundException, ServiceUnavailableException, PermissionDeniedException, AuthorizationRequiredException*/
{
    if (StringUtil.isEmpty(fileName) || StringUtil.isEmpty(fileType)) {
        return "Usage: /api/ingest/test/file?fileName=PATH&fileType=TYPE";
    }
    BufferedInputStream fileInputStream;
    try {
        fileInputStream = new BufferedInputStream(new FileInputStream(new File(fileName)));
    } catch (FileNotFoundException notfoundEx) {
        return "Could not open file " + fileName + ".";
    }
    TabularDataIngest tabDataIngest;
    try {
        TabularDataFileReader ingestPlugin = ingestService.getTabDataReaderByMimeType(fileType);
        if (ingestPlugin == null) {
            return "Could not locate an ingest plugin for type " + fileType + ".";
        }
        tabDataIngest = ingestPlugin.read(fileInputStream, null);
    } catch (IOException ingestEx) {
        return "Caught an exception trying to ingest file " + fileName + ".";
    } finally {
        // Always release the file handle, including on the early returns above;
        // this runs inside a server, so a leaked stream would accumulate.
        try {
            fileInputStream.close();
        } catch (IOException ignored) {
            // Best effort: a failed close of a read-only stream is not actionable here.
        }
    }
    if (tabDataIngest == null) {
        return "Ingest resulted in a null tabDataIngest object for file " + fileName + ".";
    }
    File tabFile = tabDataIngest.getTabDelimitedFile();
    if (tabDataIngest.getDataTable() == null || tabFile == null || !tabFile.exists()) {
        return "Ingest failed to produce tab file or data table for file " + fileName + ".";
    }
    String tabFilename = FileUtil.replaceExtension(fileName, "tab");
    try {
        java.nio.file.Files.copy(Paths.get(tabFile.getAbsolutePath()), Paths.get(tabFilename), StandardCopyOption.REPLACE_EXISTING);
    } catch (IOException ex) {
        return "Caught an exception trying to save ingested data for file " + fileName + ".";
    }
    // Wire up a transient DataFile/FileMetadata/DataTable graph; it is only
    // handed to produceSummaryStatistics below.
    DataTable dataTable = tabDataIngest.getDataTable();
    DataFile dataFile = new DataFile();
    dataFile.setStorageIdentifier(tabFilename);
    FileMetadata fileMetadata = new FileMetadata();
    fileMetadata.setLabel(fileName);
    dataFile.setDataTable(dataTable);
    dataTable.setDataFile(dataFile);
    fileMetadata.setDataFile(dataFile);
    dataFile.getFileMetadatas().add(fileMetadata);
    StringBuilder output = new StringBuilder();
    output.append("NVARS: ").append(dataTable.getVarQuantity()).append("\n");
    output.append("NOBS: ").append(dataTable.getCaseQuantity()).append("\n");
    try {
        ingestService.produceSummaryStatistics(dataFile, tabFile);
        output.append("UNF: ").append(dataTable.getUnf()).append("\n");
    } catch (IOException ioex) {
        output.append("UNF: failed to calculate\n" + "\n");
    }
    for (int i = 0; i < dataTable.getVarQuantity(); i++) {
        String vartype;
        if (dataTable.getDataVariables().get(i).isIntervalContinuous()) {
            vartype = "numeric-continuous";
        } else if (dataTable.getDataVariables().get(i).isTypeNumeric()) {
            vartype = "numeric-discrete";
        } else {
            // Character variables are further split by their format category.
            String formatCategory = dataTable.getDataVariables().get(i).getFormatCategory();
            if ("time".equals(formatCategory)) {
                vartype = "character-time";
            } else if ("date".equals(formatCategory)) {
                vartype = "character-date";
            } else {
                vartype = "character";
            }
        }
        output.append("VAR").append(i).append(" ");
        output.append(dataTable.getDataVariables().get(i).getName()).append(" ");
        output.append(vartype).append(" ");
        // StringBuilder renders a null UNF as "null"; String.concat(null)
        // would throw NullPointerException here instead.
        output.append(dataTable.getDataVariables().get(i).getUnf());
        output.append("\n");
    }
    return output.toString();
}
Also used : DataFile(edu.harvard.iq.dataverse.DataFile) TabularDataFileReader(edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataFileReader) DataTable(edu.harvard.iq.dataverse.DataTable) BufferedInputStream(java.io.BufferedInputStream) FileNotFoundException(java.io.FileNotFoundException) FileMetadata(edu.harvard.iq.dataverse.FileMetadata) TabularDataIngest(edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataIngest) IOException(java.io.IOException) DataFile(edu.harvard.iq.dataverse.DataFile) File(java.io.File) FileInputStream(java.io.FileInputStream) Path(javax.ws.rs.Path) Produces(javax.ws.rs.Produces) GET(javax.ws.rs.GET)

Aggregations

TabularDataIngest (edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataIngest)8 DataTable (edu.harvard.iq.dataverse.DataTable)7 File (java.io.File)6 IOException (java.io.IOException)6 BufferedInputStream (java.io.BufferedInputStream)5 FileInputStream (java.io.FileInputStream)5 FileNotFoundException (java.io.FileNotFoundException)5 TabularDataFileReader (edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataFileReader)4 DataFile (edu.harvard.iq.dataverse.DataFile)3 Test (org.junit.Test)2 FileMetadata (edu.harvard.iq.dataverse.FileMetadata)1 UnsupportedDataAccessOperationException (edu.harvard.iq.dataverse.dataaccess.UnsupportedDataAccessOperationException)1 FileExceedsMaxSizeException (edu.harvard.iq.dataverse.datasetutility.FileExceedsMaxSizeException)1 BufferedReader (java.io.BufferedReader)1 FileOutputStream (java.io.FileOutputStream)1 FileReader (java.io.FileReader)1 InputStreamReader (java.io.InputStreamReader)1 PrintWriter (java.io.PrintWriter)1 FileChannel (java.nio.channels.FileChannel)1 ReadableByteChannel (java.nio.channels.ReadableByteChannel)1