Example 1 with BagGenerator

use of edu.harvard.iq.dataverse.util.bagit.BagGenerator in project dataverse by IQSS.

the class LocalSubmitToArchiveCommand, method performArchiveSubmission.

@Override
public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token, Map<String, String> requestedSettings) {
    logger.fine("In LocalCloudSubmitToArchive...");
    String localPath = requestedSettings.get(":BagItLocalPath");
    String zipName = null;
    try {
        Dataset dataset = dv.getDataset();
        if (dataset.getLockFor(Reason.finalizePublication) == null && dataset.getLockFor(Reason.FileValidationFailed) == null) {
            String spaceName = dataset.getGlobalId().asString().replace(':', '-').replace('/', '-').replace('.', '-').toLowerCase();
            DataCitation dc = new DataCitation(dv);
            Map<String, String> metadata = dc.getDataCiteMetadata();
            String dataciteXml = DOIDataCiteRegisterService.getMetadataFromDvObject(dv.getDataset().getGlobalId().asString(), metadata, dv.getDataset());
            FileUtils.writeStringToFile(new File(localPath + "/" + spaceName + "-datacite.v" + dv.getFriendlyVersionNumber() + ".xml"), dataciteXml, StandardCharsets.UTF_8);
            BagGenerator bagger = new BagGenerator(new OREMap(dv, false), dataciteXml);
            bagger.setAuthenticationKey(token.getTokenString());
            zipName = localPath + "/" + spaceName + "v" + dv.getFriendlyVersionNumber() + ".zip";
            bagger.generateBag(new FileOutputStream(zipName + ".partial"));
            File srcFile = new File(zipName + ".partial");
            File destFile = new File(zipName);
            if (srcFile.renameTo(destFile)) {
                logger.fine("Localhost Submission step: Content Transferred");
                dv.setArchivalCopyLocation("file://" + zipName);
            } else {
                logger.warning("Unable to move " + zipName + ".partial to " + zipName);
            }
        } else {
            logger.warning("Localhost Submision Workflow aborted: Dataset locked for finalizePublication, or because file validation failed");
            return new Failure("Dataset locked");
        }
    } catch (Exception e) {
        logger.warning("Failed to archive " + zipName + " : " + e.getLocalizedMessage());
        e.printStackTrace();
    }
    return WorkflowStepResult.OK;
}
Also used: Dataset (edu.harvard.iq.dataverse.Dataset), BagGenerator (edu.harvard.iq.dataverse.util.bagit.BagGenerator), FileOutputStream (java.io.FileOutputStream), DataCitation (edu.harvard.iq.dataverse.DataCitation), OREMap (edu.harvard.iq.dataverse.util.bagit.OREMap), File (java.io.File), Failure (edu.harvard.iq.dataverse.workflow.step.Failure)
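
The fragile step in this example is the final File.renameTo call, which reports failure by returning false rather than throwing, so the reason for a failed move is lost. Below is a minimal sketch of the same write-to-".partial"-then-rename idea using java.nio.file.Files.move instead. It is not Dataverse code: the class name and paths are illustrative, and the use of ATOMIC_MOVE assumes the target filesystem supports atomic renames.

import java.io.IOException;
import java.nio.file.AtomicMoveNotSupportedException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;

public class PartialThenRename {

    // Promote a fully written ".partial" file to its final name. Files.move throws on
    // failure (with a cause) instead of returning false, and ATOMIC_MOVE keeps readers
    // from ever seeing a half-written zip under the final name.
    public static Path promotePartial(Path partial, Path finalName) throws IOException {
        try {
            return Files.move(partial, finalName, StandardCopyOption.ATOMIC_MOVE);
        } catch (AtomicMoveNotSupportedException e) {
            // Fall back to a plain replace if the filesystem cannot move atomically.
            return Files.move(partial, finalName, StandardCopyOption.REPLACE_EXISTING);
        }
    }

    public static void main(String[] args) throws IOException {
        // Illustrative stand-ins for zipName + ".partial" and zipName in the command above.
        Path partial = Paths.get("/tmp/example-bag.zip.partial");
        Path finalZip = Paths.get("/tmp/example-bag.zip");
        Files.write(partial, new byte[] { 0x50, 0x4B, 0x05, 0x06 }); // placeholder bytes, not a real bag
        System.out.println("Moved to " + promotePartial(partial, finalZip));
    }
}

If applied to the command above, this would replace the srcFile.renameTo(destFile) branch and make a failed move visible as an exception in the existing catch block.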

Example 2 with BagGenerator

use of edu.harvard.iq.dataverse.util.bagit.BagGenerator in project dataverse by IQSS.

the class DuraCloudSubmitToArchiveCommand, method performArchiveSubmission.

@Override
public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token, Map<String, String> requestedSettings) {
    String port = requestedSettings.get(DURACLOUD_PORT) != null ? requestedSettings.get(DURACLOUD_PORT) : DEFAULT_PORT;
    String dpnContext = requestedSettings.get(DURACLOUD_CONTEXT) != null ? requestedSettings.get(DURACLOUD_CONTEXT) : DEFAULT_CONTEXT;
    String host = requestedSettings.get(DURACLOUD_HOST);
    if (host != null) {
        Dataset dataset = dv.getDataset();
        if (dataset.getLockFor(Reason.finalizePublication) == null && dataset.getLockFor(Reason.FileValidationFailed) == null) {
            // Use DuraCloud client classes to log in
            ContentStoreManager storeManager = new ContentStoreManagerImpl(host, port, dpnContext);
            Credential credential = new Credential(System.getProperty("duracloud.username"), System.getProperty("duracloud.password"));
            storeManager.login(credential);
            String spaceName = dataset.getGlobalId().asString().replace(':', '-').replace('/', '-').replace('.', '-').toLowerCase();
            ContentStore store;
            try {
                /*
                     * If there is a failure in creating a space, it is likely that a prior version
                     * has not been fully processed (snapshot created, archiving completed and files
                     * and space deleted - currently manual operations done at the project's
                     * duracloud website)
                     */
                store = storeManager.getPrimaryContentStore();
                // Create space to copy archival files to
                store.createSpace(spaceName);
                DataCitation dc = new DataCitation(dv);
                Map<String, String> metadata = dc.getDataCiteMetadata();
                String dataciteXml = DOIDataCiteRegisterService.getMetadataFromDvObject(dv.getDataset().getGlobalId().asString(), metadata, dv.getDataset());
                MessageDigest messageDigest = MessageDigest.getInstance("MD5");
                try (PipedInputStream dataciteIn = new PipedInputStream();
                    DigestInputStream digestInputStream = new DigestInputStream(dataciteIn, messageDigest)) {
                    // Add datacite.xml file
                    new Thread(new Runnable() {

                        public void run() {
                            try (PipedOutputStream dataciteOut = new PipedOutputStream(dataciteIn)) {
                                dataciteOut.write(dataciteXml.getBytes(Charset.forName("utf-8")));
                                dataciteOut.close();
                            } catch (Exception e) {
                                logger.severe("Error creating datacite.xml: " + e.getMessage());
                                // TODO Auto-generated catch block
                                e.printStackTrace();
                                throw new RuntimeException("Error creating datacite.xml: " + e.getMessage());
                            }
                        }
                    }).start();
                    // Have seen Pipe Closed errors for other archivers when used as a workflow without this delay loop
                    int i = 0;
                    while (digestInputStream.available() <= 0 && i < 100) {
                        Thread.sleep(10);
                        i++;
                    }
                    String checksum = store.addContent(spaceName, "datacite.xml", digestInputStream, -1l, null, null, null);
                    logger.fine("Content: datacite.xml added with checksum: " + checksum);
                    String localchecksum = Hex.encodeHexString(digestInputStream.getMessageDigest().digest());
                    if (!checksum.equals(localchecksum)) {
                        logger.severe(checksum + " not equal to " + localchecksum);
                        return new Failure("Error in transferring DataCite.xml file to DuraCloud", "DuraCloud Submission Failure: incomplete metadata transfer");
                    }
                    // Store BagIt file
                    String fileName = spaceName + "v" + dv.getFriendlyVersionNumber() + ".zip";
                    // Add BagIt ZIP file
                    // Although DuraCloud uses SHA-256 internally, its API uses MD5 to verify the
                    // transfer
                    messageDigest = MessageDigest.getInstance("MD5");
                    try (PipedInputStream in = new PipedInputStream();
                        DigestInputStream digestInputStream2 = new DigestInputStream(in, messageDigest)) {
                        new Thread(new Runnable() {

                            public void run() {
                                try (PipedOutputStream out = new PipedOutputStream(in)) {
                                    // Generate bag
                                    BagGenerator bagger = new BagGenerator(new OREMap(dv, false), dataciteXml);
                                    bagger.setAuthenticationKey(token.getTokenString());
                                    bagger.generateBag(out);
                                } catch (Exception e) {
                                    logger.severe("Error creating bag: " + e.getMessage());
                                    // TODO Auto-generated catch block
                                    e.printStackTrace();
                                    throw new RuntimeException("Error creating bag: " + e.getMessage());
                                }
                            }
                        }).start();
                        i = 0;
                        while (digestInputStream2.available() <= 0 && i < 100) {
                            Thread.sleep(10);
                            i++;
                        }
                        checksum = store.addContent(spaceName, fileName, digestInputStream2, -1l, null, null, null);
                        logger.fine("Content: " + fileName + " added with checksum: " + checksum);
                        localchecksum = Hex.encodeHexString(digestInputStream2.getMessageDigest().digest());
                        if (!checksum.equals(localchecksum)) {
                            logger.severe(checksum + " not equal to " + localchecksum);
                            return new Failure("Error in transferring Zip file to DuraCloud", "DuraCloud Submission Failure: incomplete archive transfer");
                        }
                    } catch (RuntimeException rte) {
                        logger.severe(rte.getMessage());
                        return new Failure("Error in generating Bag", "DuraCloud Submission Failure: archive file not created");
                    }
                    logger.fine("DuraCloud Submission step: Content Transferred");
                    // Document the location of the dataset's archival copy (actually the URL
                    // where an admin can view it)
                    StringBuffer sb = new StringBuffer("https://");
                    sb.append(host);
                    if (!port.equals("443")) {
                        sb.append(":" + port);
                    }
                    sb.append("/duradmin/spaces/sm/");
                    sb.append(store.getStoreId());
                    sb.append("/" + spaceName + "/" + fileName);
                    dv.setArchivalCopyLocation(sb.toString());
                    logger.fine("DuraCloud Submission step complete: " + sb.toString());
                } catch (ContentStoreException | IOException e) {
                    // TODO Auto-generated catch block
                    logger.warning(e.getMessage());
                    e.printStackTrace();
                    return new Failure("Error in transferring file to DuraCloud", "DuraCloud Submission Failure: archive file not transferred");
                } catch (RuntimeException rte) {
                    logger.severe(rte.getMessage());
                    return new Failure("Error in generating datacite.xml file", "DuraCloud Submission Failure: metadata file not created");
                } catch (InterruptedException e) {
                    logger.warning(e.getLocalizedMessage());
                    e.printStackTrace();
                }
            } catch (ContentStoreException e) {
                logger.warning(e.getMessage());
                e.printStackTrace();
                String mesg = "DuraCloud Submission Failure";
                if (dv.getVersion() != 1 || dv.getMinorVersionNumber() != 0) {
                    mesg = mesg + ": Prior Version archiving not yet complete?";
                }
                return new Failure("Unable to create DuraCloud space with name: " + spaceName, mesg);
            } catch (NoSuchAlgorithmException e) {
                logger.severe("MD5 MessageDigest not available!");
            }
        } else {
            logger.warning("DuraCloud Submision Workflow aborted: Dataset locked for finalizePublication, or because file validation failed");
            return new Failure("Dataset locked");
        }
        return WorkflowStepResult.OK;
    } else {
        return new Failure("DuraCloud Submission not configured - no \":DuraCloudHost\".");
    }
}
Also used: BagGenerator (edu.harvard.iq.dataverse.util.bagit.BagGenerator), OREMap (edu.harvard.iq.dataverse.util.bagit.OREMap), PipedOutputStream (java.io.PipedOutputStream), ContentStoreManager (org.duracloud.client.ContentStoreManager), NoSuchAlgorithmException (java.security.NoSuchAlgorithmException), MessageDigest (java.security.MessageDigest), Failure (edu.harvard.iq.dataverse.workflow.step.Failure), Credential (org.duracloud.common.model.Credential), DigestInputStream (java.security.DigestInputStream), ContentStoreException (org.duracloud.error.ContentStoreException), Dataset (edu.harvard.iq.dataverse.Dataset), PipedInputStream (java.io.PipedInputStream), IOException (java.io.IOException), ContentStoreManagerImpl (org.duracloud.client.ContentStoreManagerImpl), DataCitation (edu.harvard.iq.dataverse.DataCitation), ContentStore (org.duracloud.client.ContentStore)
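
Setting the DuraCloud client calls aside, the pattern this command repeats for both uploads is: a producer thread writes into a PipedOutputStream, the consumer reads from a DigestInputStream wrapped around the matching PipedInputStream, and the MD5 accumulated during the read is compared against the checksum reported by the remote store. The sketch below isolates just that pattern; it is not Dataverse code, the class name and payload are made up, and since there is no store here the "remote" checksum is simply recomputed from the received bytes. Hex comes from Apache Commons Codec, as in the command above.

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.PipedInputStream;
import java.io.PipedOutputStream;
import java.nio.charset.StandardCharsets;
import java.security.DigestInputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import org.apache.commons.codec.binary.Hex;

public class PipedDigestSketch {

    public static void main(String[] args) throws IOException, NoSuchAlgorithmException, InterruptedException {
        byte[] payload = "stand-in for datacite.xml or a multi-GB bag".getBytes(StandardCharsets.UTF_8);

        MessageDigest md5 = MessageDigest.getInstance("MD5");
        try (PipedInputStream pipeIn = new PipedInputStream();
             DigestInputStream digestIn = new DigestInputStream(pipeIn, md5)) {

            // Producer: writes into the pipe on its own thread, like the datacite.xml
            // and BagGenerator threads in the command above. Closing the pipe signals EOF.
            Thread writer = new Thread(() -> {
                try (PipedOutputStream pipeOut = new PipedOutputStream(pipeIn)) {
                    pipeOut.write(payload);
                } catch (IOException e) {
                    throw new RuntimeException("Error writing to pipe", e);
                }
            });
            writer.start();

            // Consumer: stands in for store.addContent(...); the digest updates as bytes are read.
            ByteArrayOutputStream received = new ByteArrayOutputStream();
            byte[] buf = new byte[8192];
            int n;
            while ((n = digestIn.read(buf)) != -1) {
                received.write(buf, 0, n);
            }
            writer.join();

            String localChecksum = Hex.encodeHexString(digestIn.getMessageDigest().digest());
            String remoteChecksum = Hex.encodeHexString(
                    MessageDigest.getInstance("MD5").digest(received.toByteArray()));
            System.out.println("checksums match: " + localChecksum.equals(remoteChecksum));
        }
    }
}

Because the consumer here drains the stream to EOF in the same method, no availability polling is needed; the command above polls available() before handing the stream to the store client, per its own comments about Pipe Closed errors.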

Example 3 with BagGenerator

use of edu.harvard.iq.dataverse.util.bagit.BagGenerator in project dataverse by IQSS.

the class GoogleCloudSubmitToArchiveCommand, method performArchiveSubmission.

@Override
public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token, Map<String, String> requestedSettings) {
    logger.fine("In GoogleCloudSubmitToArchiveCommand...");
    String bucketName = requestedSettings.get(GOOGLECLOUD_BUCKET);
    String projectName = requestedSettings.get(GOOGLECLOUD_PROJECT);
    logger.fine("Project: " + projectName + " Bucket: " + bucketName);
    if (bucketName != null && projectName != null) {
        Storage storage;
        try {
            FileInputStream fis = new FileInputStream(System.getProperty("dataverse.files.directory") + System.getProperty("file.separator") + "googlecloudkey.json");
            storage = StorageOptions.newBuilder().setCredentials(ServiceAccountCredentials.fromStream(fis)).setProjectId(projectName).build().getService();
            Bucket bucket = storage.get(bucketName);
            Dataset dataset = dv.getDataset();
            if (dataset.getLockFor(Reason.finalizePublication) == null) {
                String spaceName = dataset.getGlobalId().asString().replace(':', '-').replace('/', '-').replace('.', '-').toLowerCase();
                DataCitation dc = new DataCitation(dv);
                Map<String, String> metadata = dc.getDataCiteMetadata();
                String dataciteXml = DOIDataCiteRegisterService.getMetadataFromDvObject(dv.getDataset().getGlobalId().asString(), metadata, dv.getDataset());
                String blobIdString = null;
                MessageDigest messageDigest = MessageDigest.getInstance("MD5");
                try (PipedInputStream dataciteIn = new PipedInputStream();
                    DigestInputStream digestInputStream = new DigestInputStream(dataciteIn, messageDigest)) {
                    // Add datacite.xml file
                    new Thread(new Runnable() {

                        public void run() {
                            try (PipedOutputStream dataciteOut = new PipedOutputStream(dataciteIn)) {
                                dataciteOut.write(dataciteXml.getBytes(Charset.forName("utf-8")));
                                dataciteOut.close();
                            } catch (Exception e) {
                                logger.severe("Error creating datacite.xml: " + e.getMessage());
                                // TODO Auto-generated catch block
                                e.printStackTrace();
                                throw new RuntimeException("Error creating datacite.xml: " + e.getMessage());
                            }
                        }
                    }).start();
                    // Have seen broken pipe in PostPublishDataset workflow without this delay
                    int i = 0;
                    while (digestInputStream.available() <= 0 && i < 100) {
                        Thread.sleep(10);
                        i++;
                    }
                    Blob dcXml = bucket.create(spaceName + "/datacite.v" + dv.getFriendlyVersionNumber() + ".xml", digestInputStream, "text/xml", Bucket.BlobWriteOption.doesNotExist());
                    String checksum = dcXml.getMd5ToHexString();
                    logger.fine("Content: datacite.xml added with checksum: " + checksum);
                    String localchecksum = Hex.encodeHexString(digestInputStream.getMessageDigest().digest());
                    if (!checksum.equals(localchecksum)) {
                        logger.severe(checksum + " not equal to " + localchecksum);
                        return new Failure("Error in transferring DataCite.xml file to GoogleCloud", "GoogleCloud Submission Failure: incomplete metadata transfer");
                    }
                    // Store BagIt file
                    String fileName = spaceName + ".v" + dv.getFriendlyVersionNumber() + ".zip";
                    // Add BagIt ZIP file
                    // Google uses MD5 as one way to verify the
                    // transfer
                    messageDigest = MessageDigest.getInstance("MD5");
                    try (PipedInputStream in = new PipedInputStream(100000);
                        DigestInputStream digestInputStream2 = new DigestInputStream(in, messageDigest)) {
                        Thread writeThread = new Thread(new Runnable() {

                            public void run() {
                                try (PipedOutputStream out = new PipedOutputStream(in)) {
                                    // Generate bag
                                    BagGenerator bagger = new BagGenerator(new OREMap(dv, false), dataciteXml);
                                    bagger.setAuthenticationKey(token.getTokenString());
                                    bagger.generateBag(out);
                                } catch (Exception e) {
                                    logger.severe("Error creating bag: " + e.getMessage());
                                    // TODO Auto-generated catch block
                                    e.printStackTrace();
                                    try {
                                        digestInputStream2.close();
                                    } catch (Exception ex) {
                                        logger.warning(ex.getLocalizedMessage());
                                    }
                                    throw new RuntimeException("Error creating bag: " + e.getMessage());
                                }
                            }
                        });
                        writeThread.start();
                        /*
                             * The following loop handles two issues. First, with no delay, the
                             * bucket.create() call below can get started before the piped streams are set
                             * up, causing a failure (seen when triggered in a PostPublishDataset workflow).
                             * A minimal initial wait, e.g. until some bytes are available, would address
                             * this. Second, the BagGenerator class, due to its use of parallel streaming
                             * creation of the zip file, has the characteristic that it makes a few bytes
                             * available - from setting up the directory structure for the zip file -
                             * significantly earlier than it is ready to stream file content (e.g. for
                             * thousands of files and GB of content). If, for these large datasets,
                             * bucket.create() is called as soon as bytes are available, the call can
                             * timeout before the bytes for all the zipped files are available. To manage
                             * this, the loop waits until 90K bytes are available, larger than any expected
                             * dir structure for the zip and implying that the main zipped content is
                             * available, or until the thread terminates, with all of its content written to
                             * the pipe. (Note the PipedInputStream buffer is set at 100K above - I didn't
                             * want to test whether that means that exactly 100K bytes will be available()
                             * for large datasets or not, so the test below is at 90K.)
                             * 
                             * An additional sanity check limits the wait to 2K seconds. The BagGenerator
                             * has been used to archive >120K files, 2K directories, and ~600GB files on the
                             * SEAD project (streaming content to disk rather than over an internet
                             * connection) which would take longer than 2K seconds (10+ hours) and might
                             * produce an initial set of bytes for directories > 90K. If Dataverse ever
                             * needs to support datasets of this size, the numbers here would need to be
                             * increased, and/or a change in how archives are sent to google (e.g. as
                             * multiple blobs that get aggregated) would be required.
                             */
                        i = 0;
                        while (digestInputStream2.available() <= 90000 && i < 2000 && writeThread.isAlive()) {
                            Thread.sleep(1000);
                            logger.fine("avail: " + digestInputStream2.available() + " : " + writeThread.getState().toString());
                            i++;
                        }
                        logger.fine("Bag: transfer started, i=" + i + ", avail = " + digestInputStream2.available());
                        if (i == 2000) {
                            throw new IOException("Stream not available");
                        }
                        Blob bag = bucket.create(spaceName + "/" + fileName, digestInputStream2, "application/zip", Bucket.BlobWriteOption.doesNotExist());
                        if (bag.getSize() == 0) {
                            throw new IOException("Empty Bag");
                        }
                        blobIdString = bag.getBlobId().getBucket() + "/" + bag.getBlobId().getName();
                        checksum = bag.getMd5ToHexString();
                        logger.fine("Bag: " + fileName + " added with checksum: " + checksum);
                        localchecksum = Hex.encodeHexString(digestInputStream2.getMessageDigest().digest());
                        if (!checksum.equals(localchecksum)) {
                            logger.severe(checksum + " not equal to " + localchecksum);
                            return new Failure("Error in transferring Zip file to GoogleCloud", "GoogleCloud Submission Failure: incomplete archive transfer");
                        }
                    } catch (RuntimeException rte) {
                        logger.severe("Error creating Bag during GoogleCloud archiving: " + rte.getMessage());
                        return new Failure("Error in generating Bag", "GoogleCloud Submission Failure: archive file not created");
                    }
                    logger.fine("GoogleCloud Submission step: Content Transferred");
                    // Document the location of the dataset's archival copy (actually the URL
                    // where an admin can view it)
                    StringBuffer sb = new StringBuffer("https://console.cloud.google.com/storage/browser/");
                    sb.append(blobIdString);
                    dv.setArchivalCopyLocation(sb.toString());
                } catch (RuntimeException rte) {
                    logger.severe("Error creating datacite xml file during GoogleCloud Archiving: " + rte.getMessage());
                    return new Failure("Error in generating datacite.xml file", "GoogleCloud Submission Failure: metadata file not created");
                }
            } else {
                logger.warning("GoogleCloud Submision Workflow aborted: Dataset locked for pidRegister");
                return new Failure("Dataset locked");
            }
        } catch (Exception e) {
            logger.warning(e.getLocalizedMessage());
            e.printStackTrace();
            return new Failure("GoogleCloud Submission Failure", e.getLocalizedMessage() + ": check log for details");
        }
        return WorkflowStepResult.OK;
    } else {
        return new Failure("GoogleCloud Submission not configured - no \":GoogleCloudBucket\"  and/or \":GoogleCloudProject\".");
    }
}
Also used: Blob (com.google.cloud.storage.Blob), DigestInputStream (java.security.DigestInputStream), Dataset (edu.harvard.iq.dataverse.Dataset), BagGenerator (edu.harvard.iq.dataverse.util.bagit.BagGenerator), OREMap (edu.harvard.iq.dataverse.util.bagit.OREMap), PipedOutputStream (java.io.PipedOutputStream), PipedInputStream (java.io.PipedInputStream), IOException (java.io.IOException), FileInputStream (java.io.FileInputStream), FileNotFoundException (java.io.FileNotFoundException), NoSuchAlgorithmException (java.security.NoSuchAlgorithmException), Storage (com.google.cloud.storage.Storage), Bucket (com.google.cloud.storage.Bucket), DataCitation (edu.harvard.iq.dataverse.DataCitation), MessageDigest (java.security.MessageDigest), Failure (edu.harvard.iq.dataverse.workflow.step.Failure)
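
The long comment in this example explains the guard in front of bucket.create(): wait until the pipe has buffered more bytes than any plausible zip directory preamble (90000 here), or the writer thread has finished, so that the upload does not start on a trickle of bytes and time out. Below is a small sketch of that guard pulled out as a standalone helper. It is not part of Dataverse; the class and method names are hypothetical, and the 90000-byte and 2000-second limits are simply the values used in the example.

import java.io.IOException;
import java.io.InputStream;

public class PipeReadinessGuard {

    // Block until the stream has buffered more than minBytes, the writer thread has
    // terminated (everything it will ever write is already in the pipe), or maxSeconds
    // one-second polls have elapsed. Mirrors the delay loop in
    // GoogleCloudSubmitToArchiveCommand above.
    public static void waitForStreamOrWriter(InputStream in, Thread writer, int minBytes, int maxSeconds)
            throws IOException, InterruptedException {
        int i = 0;
        while (in.available() <= minBytes && i < maxSeconds && writer.isAlive()) {
            Thread.sleep(1000L);
            i++;
        }
        if (i == maxSeconds) {
            // Same failure mode as the example: give up rather than let the upload call time out.
            throw new IOException("Stream not available after " + maxSeconds + " seconds");
        }
    }
}

In the command above, the equivalent call would be, in effect, waitForStreamOrWriter(digestInputStream2, writeThread, 90000, 2000) placed just before bucket.create(...).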

Aggregations

DataCitation (edu.harvard.iq.dataverse.DataCitation): 3
Dataset (edu.harvard.iq.dataverse.Dataset): 3
BagGenerator (edu.harvard.iq.dataverse.util.bagit.BagGenerator): 3
OREMap (edu.harvard.iq.dataverse.util.bagit.OREMap): 3
Failure (edu.harvard.iq.dataverse.workflow.step.Failure): 3
IOException (java.io.IOException): 2
PipedInputStream (java.io.PipedInputStream): 2
PipedOutputStream (java.io.PipedOutputStream): 2
DigestInputStream (java.security.DigestInputStream): 2
MessageDigest (java.security.MessageDigest): 2
NoSuchAlgorithmException (java.security.NoSuchAlgorithmException): 2
Blob (com.google.cloud.storage.Blob): 1
Bucket (com.google.cloud.storage.Bucket): 1
Storage (com.google.cloud.storage.Storage): 1
File (java.io.File): 1
FileInputStream (java.io.FileInputStream): 1
FileNotFoundException (java.io.FileNotFoundException): 1
FileOutputStream (java.io.FileOutputStream): 1
ContentStore (org.duracloud.client.ContentStore): 1
ContentStoreManager (org.duracloud.client.ContentStoreManager): 1