use of edu.harvard.iq.dataverse.util.bagit.BagGenerator in project dataverse by IQSS.
the class LocalSubmitToArchiveCommand method performArchiveSubmission.
/**
 * Archives a dataset version to the local file system.
 *
 * Writes the DataCite XML and a BagIt zip for the given version into the
 * directory named by the ":BagItLocalPath" setting. The bag is streamed to a
 * "&lt;zip&gt;.partial" file and renamed to its final name once complete; on a
 * successful rename the archival copy location of the version is set to a
 * file:// URL for the zip.
 *
 * @param dv the dataset version to archive
 * @param token API token whose string is handed to the BagGenerator as its
 *        authentication key
 * @param requestedSettings settings map; must contain ":BagItLocalPath"
 * @return WorkflowStepResult.OK when the bag is in place, otherwise a Failure
 *         (not configured, dataset locked, rename failed, or any exception
 *         while generating/writing the files)
 */
@Override
public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token, Map<String, String> requestedSettings) {
    logger.fine("In LocalCloudSubmitToArchive...");
    String localPath = requestedSettings.get(":BagItLocalPath");
    if (localPath == null) {
        // Without this guard the files would be written under a literal "null/" path.
        return new Failure("Local Submission not configured - no \":BagItLocalPath\".");
    }
    String zipName = null;
    try {
        Dataset dataset = dv.getDataset();
        if (dataset.getLockFor(Reason.finalizePublication) != null || dataset.getLockFor(Reason.FileValidationFailed) != null) {
            logger.warning("Localhost Submision Workflow aborted: Dataset locked for finalizePublication, or because file validation failed");
            return new Failure("Dataset locked");
        }

        // Derive a file-system friendly name from the global id (':', '/', '.' -> '-').
        String spaceName = dataset.getGlobalId().asString().replace(':', '-').replace('/', '-').replace('.', '-').toLowerCase();

        DataCitation dc = new DataCitation(dv);
        Map<String, String> metadata = dc.getDataCiteMetadata();
        String dataciteXml = DOIDataCiteRegisterService.getMetadataFromDvObject(dv.getDataset().getGlobalId().asString(), metadata, dv.getDataset());
        FileUtils.writeStringToFile(new File(localPath + "/" + spaceName + "-datacite.v" + dv.getFriendlyVersionNumber() + ".xml"), dataciteXml, StandardCharsets.UTF_8);

        BagGenerator bagger = new BagGenerator(new OREMap(dv, false), dataciteXml);
        bagger.setAuthenticationKey(token.getTokenString());
        zipName = localPath + "/" + spaceName + "v" + dv.getFriendlyVersionNumber() + ".zip";

        File srcFile = new File(zipName + ".partial");
        File destFile = new File(zipName);
        // try-with-resources guarantees the file handle is released even if bag
        // generation throws, and that the file is closed before the rename.
        try (FileOutputStream bagOut = new FileOutputStream(srcFile)) {
            bagger.generateBag(bagOut);
        }

        if (!srcFile.renameTo(destFile)) {
            logger.warning("Unable to move " + zipName + ".partial to " + zipName);
            // The bag never reached its final name, so do not report success.
            return new Failure("Localhost Submission Failure", "Unable to move " + zipName + ".partial to " + zipName);
        }
        logger.fine("Localhost Submission step: Content Transferred");
        dv.setArchivalCopyLocation("file://" + zipName);
    } catch (Exception e) {
        logger.warning("Failed to archive " + zipName + " : " + e.getLocalizedMessage());
        e.printStackTrace();
        // Report the failure to the workflow instead of falling through to OK.
        return new Failure("Localhost Submission Failure", e.getLocalizedMessage() + ": check log for details");
    }
    return WorkflowStepResult.OK;
}
use of edu.harvard.iq.dataverse.util.bagit.BagGenerator in project dataverse by IQSS.
the class DuraCloudSubmitToArchiveCommand method performArchiveSubmission.
/**
 * Archives a dataset version to DuraCloud.
 *
 * Creates a DuraCloud space named after the dataset's global id, uploads a
 * datacite.xml file and then a BagIt zip of the version, verifying each upload
 * by comparing the checksum returned by DuraCloud with a locally computed MD5.
 * On success the archival copy location of the version is set to the duradmin
 * URL of the uploaded zip.
 *
 * Both uploads are streamed through a piped stream pair: a writer thread
 * produces the bytes while this thread hands the reading end to DuraCloud.
 *
 * @param dv the dataset version to archive
 * @param token API token whose string is handed to the BagGenerator as its
 *        authentication key
 * @param requestedSettings settings map; DURACLOUD_HOST is required,
 *        DURACLOUD_PORT and DURACLOUD_CONTEXT fall back to defaults
 * @return WorkflowStepResult.OK when both files were transferred and verified,
 *         otherwise a Failure describing what went wrong
 */
@Override
public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token, Map<String, String> requestedSettings) {
    String port = requestedSettings.get(DURACLOUD_PORT) != null ? requestedSettings.get(DURACLOUD_PORT) : DEFAULT_PORT;
    String dpnContext = requestedSettings.get(DURACLOUD_CONTEXT) != null ? requestedSettings.get(DURACLOUD_CONTEXT) : DEFAULT_CONTEXT;
    String host = requestedSettings.get(DURACLOUD_HOST);
    if (host == null) {
        return new Failure("DuraCloud Submission not configured - no \":DuraCloudHost\".");
    }

    Dataset dataset = dv.getDataset();
    if (dataset.getLockFor(Reason.finalizePublication) != null || dataset.getLockFor(Reason.FileValidationFailed) != null) {
        logger.warning("DuraCloud Submision Workflow aborted: Dataset locked for finalizePublication, or because file validation failed");
        return new Failure("Dataset locked");
    }

    // Use Duracloud client classes to login
    ContentStoreManager storeManager = new ContentStoreManagerImpl(host, port, dpnContext);
    Credential credential = new Credential(System.getProperty("duracloud.username"), System.getProperty("duracloud.password"));
    storeManager.login(credential);

    // Derive a space name from the global id (':', '/', '.' -> '-').
    String spaceName = dataset.getGlobalId().asString().replace(':', '-').replace('/', '-').replace('.', '-').toLowerCase();
    ContentStore store;
    try {
        /*
         * If there is a failure in creating a space, it is likely that a prior version
         * has not been fully processed (snapshot created, archiving completed and files
         * and space deleted - currently manual operations done at the project's
         * duracloud website)
         */
        store = storeManager.getPrimaryContentStore();
        // Create space to copy archival files to
        store.createSpace(spaceName);

        DataCitation dc = new DataCitation(dv);
        Map<String, String> metadata = dc.getDataCiteMetadata();
        String dataciteXml = DOIDataCiteRegisterService.getMetadataFromDvObject(dv.getDataset().getGlobalId().asString(), metadata, dv.getDataset());

        MessageDigest messageDigest = MessageDigest.getInstance("MD5");
        try (PipedInputStream dataciteIn = new PipedInputStream();
                DigestInputStream digestInputStream = new DigestInputStream(dataciteIn, messageDigest)) {
            // Add datacite.xml file
            new Thread(new Runnable() {
                public void run() {
                    // try-with-resources closes the pipe; no explicit close() needed.
                    try (PipedOutputStream dataciteOut = new PipedOutputStream(dataciteIn)) {
                        dataciteOut.write(dataciteXml.getBytes(Charset.forName("utf-8")));
                    } catch (Exception e) {
                        logger.severe("Error creating datacite.xml: " + e.getMessage());
                        e.printStackTrace();
                        // NOTE: thrown on the writer thread, so it terminates that thread
                        // only; it is not seen by the catch clauses of the calling thread.
                        throw new RuntimeException("Error creating datacite.xml: " + e.getMessage());
                    }
                }
            }).start();
            // Have seen Pipe Closed errors for other archivers when used as a workflow without this delay loop
            int i = 0;
            while (digestInputStream.available() <= 0 && i < 100) {
                Thread.sleep(10);
                i++;
            }
            String checksum = store.addContent(spaceName, "datacite.xml", digestInputStream, -1L, null, null, null);
            logger.fine("Content: datacite.xml added with checksum: " + checksum);
            String localchecksum = Hex.encodeHexString(digestInputStream.getMessageDigest().digest());
            if (!checksum.equals(localchecksum)) {
                logger.severe(checksum + " not equal to " + localchecksum);
                return new Failure("Error in transferring DataCite.xml file to DuraCloud", "DuraCloud Submission Failure: incomplete metadata transfer");
            }

            // Store BagIt file
            String fileName = spaceName + "v" + dv.getFriendlyVersionNumber() + ".zip";
            // Add BagIt ZIP file
            // Although DuraCloud uses SHA-256 internally, its API uses MD5 to verify the
            // transfer
            MessageDigest bagDigest = MessageDigest.getInstance("MD5");
            try (PipedInputStream in = new PipedInputStream();
                    DigestInputStream digestInputStream2 = new DigestInputStream(in, bagDigest)) {
                new Thread(new Runnable() {
                    public void run() {
                        try (PipedOutputStream out = new PipedOutputStream(in)) {
                            // Generate bag
                            BagGenerator bagger = new BagGenerator(new OREMap(dv, false), dataciteXml);
                            bagger.setAuthenticationKey(token.getTokenString());
                            bagger.generateBag(out);
                        } catch (Exception e) {
                            logger.severe("Error creating bag: " + e.getMessage());
                            e.printStackTrace();
                            // NOTE: terminates the writer thread only (see above).
                            throw new RuntimeException("Error creating bag: " + e.getMessage());
                        }
                    }
                }).start();
                // Wait (up to ~1s) for the bag stream itself to have bytes. This must poll
                // digestInputStream2; polling the datacite stream here would not wait for
                // the bag writer at all.
                i = 0;
                while (digestInputStream2.available() <= 0 && i < 100) {
                    Thread.sleep(10);
                    i++;
                }
                checksum = store.addContent(spaceName, fileName, digestInputStream2, -1L, null, null, null);
                logger.fine("Content: " + fileName + " added with checksum: " + checksum);
                localchecksum = Hex.encodeHexString(digestInputStream2.getMessageDigest().digest());
                if (!checksum.equals(localchecksum)) {
                    logger.severe(checksum + " not equal to " + localchecksum);
                    return new Failure("Error in transferring Zip file to DuraCloud", "DuraCloud Submission Failure: incomplete archive transfer");
                }
            } catch (RuntimeException rte) {
                logger.severe(rte.getMessage());
                return new Failure("Error in generating Bag", "DuraCloud Submission Failure: archive file not created");
            }

            logger.fine("DuraCloud Submission step: Content Transferred");
            // Document the location of dataset archival copy location (actually the URL
            // where you can view it as an admin)
            StringBuilder sb = new StringBuilder("https://");
            sb.append(host);
            if (!port.equals("443")) {
                sb.append(":" + port);
            }
            sb.append("/duradmin/spaces/sm/");
            sb.append(store.getStoreId());
            sb.append("/" + spaceName + "/" + fileName);
            dv.setArchivalCopyLocation(sb.toString());
            logger.fine("DuraCloud Submission step complete: " + sb.toString());
        } catch (ContentStoreException | IOException e) {
            logger.warning(e.getMessage());
            e.printStackTrace();
            return new Failure("Error in transferring file to DuraCloud", "DuraCloud Submission Failure: archive file not transferred");
        } catch (RuntimeException rte) {
            logger.severe(rte.getMessage());
            return new Failure("Error in generating datacite.xml file", "DuraCloud Submission Failure: metadata file not created");
        } catch (InterruptedException e) {
            logger.warning(e.getLocalizedMessage());
            e.printStackTrace();
            // Restore the interrupt flag for callers and report the incomplete
            // transfer instead of falling through to OK.
            Thread.currentThread().interrupt();
            return new Failure("Error in transferring file to DuraCloud", "DuraCloud Submission Failure: interrupted before transfer completed");
        }
    } catch (ContentStoreException e) {
        logger.warning(e.getMessage());
        e.printStackTrace();
        String mesg = "DuraCloud Submission Failure";
        // NOTE(review): this compares dv.getVersion() with 1 - confirm that is the
        // major version number intended for the "first version" test.
        if (!(1 == dv.getVersion()) || !(0 == dv.getMinorVersionNumber())) {
            mesg = mesg + ": Prior Version archiving not yet complete?";
        }
        return new Failure("Unable to create DuraCloud space with name: " + spaceName, mesg);
    } catch (NoSuchAlgorithmException e) {
        logger.severe("MD5 MessageDigest not available!");
        // Nothing was verified (and possibly nothing transferred), so this is not a success.
        return new Failure("MD5 MessageDigest not available", "DuraCloud Submission Failure: unable to verify transfer");
    }
    return WorkflowStepResult.OK;
}
use of edu.harvard.iq.dataverse.util.bagit.BagGenerator in project dataverse by IQSS.
the class GoogleCloudSubmitToArchiveCommand method performArchiveSubmission.
/**
 * Archives a dataset version to a Google Cloud Storage bucket.
 *
 * Authenticates with the service account key read from
 * "googlecloudkey.json" in the directory named by the
 * "dataverse.files.directory" system property, then uploads a DataCite XML
 * file and a BagIt zip of the version into the configured bucket. Each upload
 * is verified by comparing the blob's MD5 with a locally computed MD5. On
 * success the archival copy location of the version is set to the cloud
 * console URL of the uploaded zip.
 *
 * Both uploads are streamed through a piped stream pair: a writer thread
 * produces the bytes while this thread hands the reading end to the bucket.
 *
 * @param dv the dataset version to archive
 * @param token API token whose string is handed to the BagGenerator as its
 *        authentication key
 * @param requestedSettings settings map; GOOGLECLOUD_BUCKET and
 *        GOOGLECLOUD_PROJECT are both required
 * @return WorkflowStepResult.OK when both files were transferred and verified,
 *         otherwise a Failure describing what went wrong
 */
@Override
public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token, Map<String, String> requestedSettings) {
    logger.fine("In GoogleCloudSubmitToArchiveCommand...");
    String bucketName = requestedSettings.get(GOOGLECLOUD_BUCKET);
    String projectName = requestedSettings.get(GOOGLECLOUD_PROJECT);
    logger.fine("Project: " + projectName + " Bucket: " + bucketName);
    if (bucketName == null || projectName == null) {
        return new Failure("GoogleCloud Submission not configured - no \":GoogleCloudBucket\" and/or \":GoogleCloudProject\".");
    }

    try {
        Storage storage;
        // Close the key file as soon as the credentials have been read from it.
        try (FileInputStream fis = new FileInputStream(System.getProperty("dataverse.files.directory") + System.getProperty("file.separator") + "googlecloudkey.json")) {
            storage = StorageOptions.newBuilder().setCredentials(ServiceAccountCredentials.fromStream(fis)).setProjectId(projectName).build().getService();
        }
        Bucket bucket = storage.get(bucketName);
        if (bucket == null) {
            // A missing bucket would otherwise surface below as a NullPointerException
            // misreported as a datacite.xml generation error.
            logger.warning("GoogleCloud bucket not found: " + bucketName);
            return new Failure("GoogleCloud Submission Failure", "Bucket " + bucketName + " not found: check log for details");
        }

        Dataset dataset = dv.getDataset();
        if (dataset.getLockFor(Reason.finalizePublication) != null) {
            logger.warning("GoogleCloud Submision Workflow aborted: Dataset locked for finalizePublication");
            return new Failure("Dataset locked");
        }

        // Derive a folder name from the global id (':', '/', '.' -> '-').
        String spaceName = dataset.getGlobalId().asString().replace(':', '-').replace('/', '-').replace('.', '-').toLowerCase();
        DataCitation dc = new DataCitation(dv);
        Map<String, String> metadata = dc.getDataCiteMetadata();
        String dataciteXml = DOIDataCiteRegisterService.getMetadataFromDvObject(dv.getDataset().getGlobalId().asString(), metadata, dv.getDataset());
        String blobIdString = null;

        MessageDigest messageDigest = MessageDigest.getInstance("MD5");
        try (PipedInputStream dataciteIn = new PipedInputStream();
                DigestInputStream digestInputStream = new DigestInputStream(dataciteIn, messageDigest)) {
            // Add datacite.xml file
            new Thread(new Runnable() {
                public void run() {
                    // try-with-resources closes the pipe; no explicit close() needed.
                    try (PipedOutputStream dataciteOut = new PipedOutputStream(dataciteIn)) {
                        dataciteOut.write(dataciteXml.getBytes(Charset.forName("utf-8")));
                    } catch (Exception e) {
                        logger.severe("Error creating datacite.xml: " + e.getMessage());
                        e.printStackTrace();
                        // NOTE: thrown on the writer thread, so it terminates that thread
                        // only; it is not seen by the catch clauses of the calling thread.
                        throw new RuntimeException("Error creating datacite.xml: " + e.getMessage());
                    }
                }
            }).start();
            // Have seen broken pipe in PostPublishDataset workflow without this delay
            int i = 0;
            while (digestInputStream.available() <= 0 && i < 100) {
                Thread.sleep(10);
                i++;
            }
            Blob dcXml = bucket.create(spaceName + "/datacite.v" + dv.getFriendlyVersionNumber() + ".xml", digestInputStream, "text/xml", Bucket.BlobWriteOption.doesNotExist());
            String checksum = dcXml.getMd5ToHexString();
            logger.fine("Content: datacite.xml added with checksum: " + checksum);
            String localchecksum = Hex.encodeHexString(digestInputStream.getMessageDigest().digest());
            if (!checksum.equals(localchecksum)) {
                logger.severe(checksum + " not equal to " + localchecksum);
                return new Failure("Error in transferring DataCite.xml file to GoogleCloud", "GoogleCloud Submission Failure: incomplete metadata transfer");
            }

            // Store BagIt file
            String fileName = spaceName + ".v" + dv.getFriendlyVersionNumber() + ".zip";
            // Add BagIt ZIP file
            // Google uses MD5 as one way to verify the transfer
            MessageDigest bagDigest = MessageDigest.getInstance("MD5");
            try (PipedInputStream in = new PipedInputStream(100000);
                    DigestInputStream digestInputStream2 = new DigestInputStream(in, bagDigest)) {
                Thread writeThread = new Thread(new Runnable() {
                    public void run() {
                        try (PipedOutputStream out = new PipedOutputStream(in)) {
                            // Generate bag
                            BagGenerator bagger = new BagGenerator(new OREMap(dv, false), dataciteXml);
                            bagger.setAuthenticationKey(token.getTokenString());
                            bagger.generateBag(out);
                        } catch (Exception e) {
                            logger.severe("Error creating bag: " + e.getMessage());
                            e.printStackTrace();
                            // Close the reading end so the upload on the other thread
                            // does not keep waiting on a pipe that will never be fed.
                            try {
                                digestInputStream2.close();
                            } catch (Exception ex) {
                                logger.warning(ex.getLocalizedMessage());
                            }
                            throw new RuntimeException("Error creating bag: " + e.getMessage());
                        }
                    }
                });
                writeThread.start();
                /*
                 * The following loop handles two issues. First, with no delay, the
                 * bucket.create() call below can get started before the piped streams are set
                 * up, causing a failure (seen when triggered in a PostPublishDataset workflow).
                 * A minimal initial wait, e.g. until some bytes are available, would address
                 * this. Second, the BagGenerator class, due to its use of parallel streaming
                 * creation of the zip file, has the characteristic that it makes a few bytes
                 * available - from setting up the directory structure for the zip file -
                 * significantly earlier than it is ready to stream file content (e.g. for
                 * thousands of files and GB of content). If, for these large datasets,
                 * bucket.create() is called as soon as bytes are available, the call can
                 * timeout before the bytes for all the zipped files are available. To manage
                 * this, the loop waits until 90K bytes are available, larger than any expected
                 * dir structure for the zip and implying that the main zipped content is
                 * available, or until the thread terminates, with all of its content written to
                 * the pipe. (Note the PipedInputStream buffer is set at 100K above - I didn't
                 * want to test whether that means that exactly 100K bytes will be available()
                 * for large datasets or not, so the test below is at 90K.)
                 *
                 * An additional sanity check limits the wait to 2K seconds. The BagGenerator
                 * has been used to archive >120K files, 2K directories, and ~600GB files on the
                 * SEAD project (streaming content to disk rather than over an internet
                 * connection) which would take longer than 2K seconds (10+ hours) and might
                 * produce an initial set of bytes for directories > 90K. If Dataverse ever
                 * needs to support datasets of this size, the numbers here would need to be
                 * increased, and/or a change in how archives are sent to google (e.g. as
                 * multiple blobs that get aggregated) would be required.
                 */
                i = 0;
                while (digestInputStream2.available() <= 90000 && i < 2000 && writeThread.isAlive()) {
                    Thread.sleep(1000);
                    logger.fine("avail: " + digestInputStream2.available() + " : " + writeThread.getState().toString());
                    i++;
                }
                logger.fine("Bag: transfer started, i=" + i + ", avail = " + digestInputStream2.available());
                if (i == 2000) {
                    throw new IOException("Stream not available");
                }
                Blob bag = bucket.create(spaceName + "/" + fileName, digestInputStream2, "application/zip", Bucket.BlobWriteOption.doesNotExist());
                if (bag.getSize() == 0) {
                    throw new IOException("Empty Bag");
                }
                blobIdString = bag.getBlobId().getBucket() + "/" + bag.getBlobId().getName();
                checksum = bag.getMd5ToHexString();
                logger.fine("Bag: " + fileName + " added with checksum: " + checksum);
                localchecksum = Hex.encodeHexString(digestInputStream2.getMessageDigest().digest());
                if (!checksum.equals(localchecksum)) {
                    logger.severe(checksum + " not equal to " + localchecksum);
                    return new Failure("Error in transferring Zip file to GoogleCloud", "GoogleCloud Submission Failure: incomplete archive transfer");
                }
            } catch (RuntimeException rte) {
                logger.severe("Error creating Bag during GoogleCloud archiving: " + rte.getMessage());
                return new Failure("Error in generating Bag", "GoogleCloud Submission Failure: archive file not created");
            }

            logger.fine("GoogleCloud Submission step: Content Transferred");
            // Document the location of dataset archival copy location (actually the URL
            // where you can view it as an admin)
            StringBuilder sb = new StringBuilder("https://console.cloud.google.com/storage/browser/");
            sb.append(blobIdString);
            dv.setArchivalCopyLocation(sb.toString());
        } catch (RuntimeException rte) {
            logger.severe("Error creating datacite xml file during GoogleCloud Archiving: " + rte.getMessage());
            return new Failure("Error in generating datacite.xml file", "GoogleCloud Submission Failure: metadata file not created");
        }
    } catch (Exception e) {
        if (e instanceof InterruptedException) {
            // Restore the interrupt flag so callers can still observe the interruption.
            Thread.currentThread().interrupt();
        }
        logger.warning(e.getLocalizedMessage());
        e.printStackTrace();
        return new Failure("GoogleCloud Submission Failure", e.getLocalizedMessage() + ": check log for details");
    }
    return WorkflowStepResult.OK;
}
Aggregations