Use of edu.harvard.iq.dataverse.DataCitation in project dataverse by IQSS.
The class LocalSubmitToArchiveCommand, method performArchiveSubmission.
@Override
public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token, Map<String, String> requestedSettings) {
logger.fine("In LocalCloudSubmitToArchive...");
String localPath = requestedSettings.get(":BagItLocalPath");
String zipName = null;
try {
Dataset dataset = dv.getDataset();
if (dataset.getLockFor(Reason.finalizePublication) == null && dataset.getLockFor(Reason.FileValidationFailed) == null) {
String spaceName = dataset.getGlobalId().asString().replace(':', '-').replace('/', '-').replace('.', '-').toLowerCase();
DataCitation dc = new DataCitation(dv);
Map<String, String> metadata = dc.getDataCiteMetadata();
String dataciteXml = DOIDataCiteRegisterService.getMetadataFromDvObject(dv.getDataset().getGlobalId().asString(), metadata, dv.getDataset());
FileUtils.writeStringToFile(new File(localPath + "/" + spaceName + "-datacite.v" + dv.getFriendlyVersionNumber() + ".xml"), dataciteXml, StandardCharsets.UTF_8);
BagGenerator bagger = new BagGenerator(new OREMap(dv, false), dataciteXml);
bagger.setAuthenticationKey(token.getTokenString());
zipName = localPath + "/" + spaceName + "v" + dv.getFriendlyVersionNumber() + ".zip";
bagger.generateBag(new FileOutputStream(zipName + ".partial"));
File srcFile = new File(zipName + ".partial");
File destFile = new File(zipName);
if (srcFile.renameTo(destFile)) {
logger.fine("Localhost Submission step: Content Transferred");
dv.setArchivalCopyLocation("file://" + zipName);
} else {
logger.warning("Unable to move " + zipName + ".partial to " + zipName);
}
} else {
logger.warning("Localhost Submision Workflow aborted: Dataset locked for finalizePublication, or because file validation failed");
return new Failure("Dataset locked");
}
} catch (Exception e) {
logger.warning("Failed to archive " + zipName + " : " + e.getLocalizedMessage());
e.printStackTrace();
}
return WorkflowStepResult.OK;
}
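The ".partial then rename" step above keeps a half-written zip from ever being recorded as the archival copy. Below is a purely illustrative variant of the same idiom using java.nio (the command itself relies on File.renameTo, and the writeThenPublish helper is hypothetical, not Dataverse code):

import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;

// Hypothetical sketch of the ".partial then rename" idiom; not Dataverse code.
static void writeThenPublish(Path finalZip, BagGenerator bagger) throws Exception {
    Path partial = finalZip.resolveSibling(finalZip.getFileName() + ".partial");
    try (OutputStream out = Files.newOutputStream(partial)) {
        // Stream the Bag into the temporary file first.
        bagger.generateBag(out);
    }
    // Expose the zip under its final name only after generation has succeeded.
    Files.move(partial, finalZip, StandardCopyOption.REPLACE_EXISTING);
}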
Use of edu.harvard.iq.dataverse.DataCitation in project dataverse by IQSS.
The class DuraCloudSubmitToArchiveCommand, method performArchiveSubmission.
@Override
public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token, Map<String, String> requestedSettings) {
String port = requestedSettings.get(DURACLOUD_PORT) != null ? requestedSettings.get(DURACLOUD_PORT) : DEFAULT_PORT;
String dpnContext = requestedSettings.get(DURACLOUD_CONTEXT) != null ? requestedSettings.get(DURACLOUD_CONTEXT) : DEFAULT_CONTEXT;
String host = requestedSettings.get(DURACLOUD_HOST);
if (host != null) {
Dataset dataset = dv.getDataset();
if (dataset.getLockFor(Reason.finalizePublication) == null && dataset.getLockFor(Reason.FileValidationFailed) == null) {
// Use Duracloud client classes to login
ContentStoreManager storeManager = new ContentStoreManagerImpl(host, port, dpnContext);
Credential credential = new Credential(System.getProperty("duracloud.username"), System.getProperty("duracloud.password"));
storeManager.login(credential);
String spaceName = dataset.getGlobalId().asString().replace(':', '-').replace('/', '-').replace('.', '-').toLowerCase();
ContentStore store;
try {
/*
* If there is a failure in creating a space, it is likely that a prior version
* has not been fully processed (snapshot created, archiving completed and files
* and space deleted - currently manual operations done at the project's
* duracloud website)
*/
store = storeManager.getPrimaryContentStore();
// Create space to copy archival files to
store.createSpace(spaceName);
DataCitation dc = new DataCitation(dv);
Map<String, String> metadata = dc.getDataCiteMetadata();
String dataciteXml = DOIDataCiteRegisterService.getMetadataFromDvObject(dv.getDataset().getGlobalId().asString(), metadata, dv.getDataset());
MessageDigest messageDigest = MessageDigest.getInstance("MD5");
try (PipedInputStream dataciteIn = new PipedInputStream();
DigestInputStream digestInputStream = new DigestInputStream(dataciteIn, messageDigest)) {
// Add datacite.xml file
new Thread(new Runnable() {
public void run() {
try (PipedOutputStream dataciteOut = new PipedOutputStream(dataciteIn)) {
dataciteOut.write(dataciteXml.getBytes(Charset.forName("utf-8")));
dataciteOut.close();
} catch (Exception e) {
logger.severe("Error creating datacite.xml: " + e.getMessage());
e.printStackTrace();
throw new RuntimeException("Error creating datacite.xml: " + e.getMessage());
}
}
}).start();
// "Pipe closed" errors have been seen for other archivers when run as a workflow without this delay loop
int i = 0;
while (digestInputStream.available() <= 0 && i < 100) {
Thread.sleep(10);
i++;
}
String checksum = store.addContent(spaceName, "datacite.xml", digestInputStream, -1L, null, null, null);
logger.fine("Content: datacite.xml added with checksum: " + checksum);
String localchecksum = Hex.encodeHexString(digestInputStream.getMessageDigest().digest());
if (!checksum.equals(localchecksum)) {
logger.severe(checksum + " not equal to " + localchecksum);
return new Failure("Error in transferring DataCite.xml file to DuraCloud", "DuraCloud Submission Failure: incomplete metadata transfer");
}
// Store BagIt file
String fileName = spaceName + "v" + dv.getFriendlyVersionNumber() + ".zip";
// Add BagIt ZIP file
// Although DuraCloud uses SHA-256 internally, its API uses MD5 to verify the transfer
messageDigest = MessageDigest.getInstance("MD5");
try (PipedInputStream in = new PipedInputStream();
DigestInputStream digestInputStream2 = new DigestInputStream(in, messageDigest)) {
new Thread(new Runnable() {
public void run() {
try (PipedOutputStream out = new PipedOutputStream(in)) {
// Generate bag
BagGenerator bagger = new BagGenerator(new OREMap(dv, false), dataciteXml);
bagger.setAuthenticationKey(token.getTokenString());
bagger.generateBag(out);
} catch (Exception e) {
logger.severe("Error creating bag: " + e.getMessage());
// TODO Auto-generated catch block
e.printStackTrace();
throw new RuntimeException("Error creating bag: " + e.getMessage());
}
}
}).start();
i = 0;
// Wait on the Bag stream (digestInputStream2), not the already-consumed datacite stream.
while (digestInputStream2.available() <= 0 && i < 100) {
Thread.sleep(10);
i++;
}
checksum = store.addContent(spaceName, fileName, digestInputStream2, -1L, null, null, null);
logger.fine("Content: " + fileName + " added with checksum: " + checksum);
localchecksum = Hex.encodeHexString(digestInputStream2.getMessageDigest().digest());
if (!checksum.equals(localchecksum)) {
logger.severe(checksum + " not equal to " + localchecksum);
return new Failure("Error in transferring Zip file to DuraCloud", "DuraCloud Submission Failure: incomplete archive transfer");
}
} catch (RuntimeException rte) {
logger.severe(rte.getMessage());
return new Failure("Error in generating Bag", "DuraCloud Submission Failure: archive file not created");
}
logger.fine("DuraCloud Submission step: Content Transferred");
// Document the dataset's archival copy location (actually the URL where an admin can view it)
StringBuffer sb = new StringBuffer("https://");
sb.append(host);
if (!port.equals("443")) {
sb.append(":" + port);
}
sb.append("/duradmin/spaces/sm/");
sb.append(store.getStoreId());
sb.append("/" + spaceName + "/" + fileName);
dv.setArchivalCopyLocation(sb.toString());
logger.fine("DuraCloud Submission step complete: " + sb.toString());
} catch (ContentStoreException | IOException e) {
logger.warning(e.getMessage());
e.printStackTrace();
return new Failure("Error in transferring file to DuraCloud", "DuraCloud Submission Failure: archive file not transferred");
} catch (RuntimeException rte) {
logger.severe(rte.getMessage());
return new Failure("Error in generating datacite.xml file", "DuraCloud Submission Failure: metadata file not created");
} catch (InterruptedException e) {
logger.warning(e.getLocalizedMessage());
e.printStackTrace();
}
} catch (ContentStoreException e) {
logger.warning(e.getMessage());
e.printStackTrace();
String mesg = "DuraCloud Submission Failure";
if (dv.getVersion() != 1 || dv.getMinorVersionNumber() != 0) {
mesg = mesg + ": Prior Version archiving not yet complete?";
}
return new Failure("Unable to create DuraCloud space with name: " + spaceName, mesg);
} catch (NoSuchAlgorithmException e) {
logger.severe("MD5 MessageDigest not available!");
}
} else {
logger.warning("DuraCloud Submision Workflow aborted: Dataset locked for finalizePublication, or because file validation failed");
return new Failure("Dataset locked");
}
return WorkflowStepResult.OK;
} else {
return new Failure("DuraCloud Submission not configured - no \":DuraCloudHost\".");
}
}
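Both the DuraCloud command above and the GoogleCloud command below rely on the same producer/consumer pattern: a background thread writes into a PipedOutputStream while the upload call reads the connected PipedInputStream through a DigestInputStream, so the locally computed MD5 can be compared with the checksum the remote store reports. A minimal sketch of that pattern in isolation (streamWithMd5 is an illustrative name, and the drain loop stands in for store.addContent or bucket.create; this is not Dataverse code):

import java.io.IOException;
import java.io.PipedInputStream;
import java.io.PipedOutputStream;
import java.io.UncheckedIOException;
import java.nio.charset.StandardCharsets;
import java.security.DigestInputStream;
import java.security.MessageDigest;
import org.apache.commons.codec.binary.Hex;

// Illustrative only: stream bytes from a producer thread through a digesting
// pipe and verify the local MD5 against a checksum reported by the remote end.
static boolean streamWithMd5(String payload, String remoteChecksum) throws Exception {
    MessageDigest md = MessageDigest.getInstance("MD5");
    try (PipedInputStream in = new PipedInputStream();
         DigestInputStream digestIn = new DigestInputStream(in, md)) {
        new Thread(() -> {
            try (PipedOutputStream out = new PipedOutputStream(in)) {
                out.write(payload.getBytes(StandardCharsets.UTF_8)); // producer side
            } catch (IOException e) {
                throw new UncheckedIOException(e);
            }
        }).start();
        // In the real commands, store.addContent()/bucket.create() consumes digestIn;
        // here the stream is simply drained so the digest sees every byte.
        while (digestIn.read() != -1) { /* drain */ }
        String localChecksum = Hex.encodeHexString(digestIn.getMessageDigest().digest());
        return localChecksum.equals(remoteChecksum);
    }
}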
Use of edu.harvard.iq.dataverse.DataCitation in project dataverse by IQSS.
The class GoogleCloudSubmitToArchiveCommand, method performArchiveSubmission.
@Override
public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token, Map<String, String> requestedSettings) {
logger.fine("In GoogleCloudSubmitToArchiveCommand...");
String bucketName = requestedSettings.get(GOOGLECLOUD_BUCKET);
String projectName = requestedSettings.get(GOOGLECLOUD_PROJECT);
logger.fine("Project: " + projectName + " Bucket: " + bucketName);
if (bucketName != null && projectName != null) {
Storage storage;
try {
FileInputStream fis = new FileInputStream(System.getProperty("dataverse.files.directory") + System.getProperty("file.separator") + "googlecloudkey.json");
storage = StorageOptions.newBuilder().setCredentials(ServiceAccountCredentials.fromStream(fis)).setProjectId(projectName).build().getService();
Bucket bucket = storage.get(bucketName);
Dataset dataset = dv.getDataset();
if (dataset.getLockFor(Reason.finalizePublication) == null) {
String spaceName = dataset.getGlobalId().asString().replace(':', '-').replace('/', '-').replace('.', '-').toLowerCase();
DataCitation dc = new DataCitation(dv);
Map<String, String> metadata = dc.getDataCiteMetadata();
String dataciteXml = DOIDataCiteRegisterService.getMetadataFromDvObject(dv.getDataset().getGlobalId().asString(), metadata, dv.getDataset());
String blobIdString = null;
MessageDigest messageDigest = MessageDigest.getInstance("MD5");
try (PipedInputStream dataciteIn = new PipedInputStream();
DigestInputStream digestInputStream = new DigestInputStream(dataciteIn, messageDigest)) {
// Add datacite.xml file
new Thread(new Runnable() {
public void run() {
try (PipedOutputStream dataciteOut = new PipedOutputStream(dataciteIn)) {
dataciteOut.write(dataciteXml.getBytes(Charset.forName("utf-8")));
dataciteOut.close();
} catch (Exception e) {
logger.severe("Error creating datacite.xml: " + e.getMessage());
e.printStackTrace();
throw new RuntimeException("Error creating datacite.xml: " + e.getMessage());
}
}
}).start();
// Broken-pipe errors have been seen in the PostPublishDataset workflow without this delay loop
int i = 0;
while (digestInputStream.available() <= 0 && i < 100) {
Thread.sleep(10);
i++;
}
Blob dcXml = bucket.create(spaceName + "/datacite.v" + dv.getFriendlyVersionNumber() + ".xml", digestInputStream, "text/xml", Bucket.BlobWriteOption.doesNotExist());
String checksum = dcXml.getMd5ToHexString();
logger.fine("Content: datacite.xml added with checksum: " + checksum);
String localchecksum = Hex.encodeHexString(digestInputStream.getMessageDigest().digest());
if (!checksum.equals(localchecksum)) {
logger.severe(checksum + " not equal to " + localchecksum);
return new Failure("Error in transferring DataCite.xml file to GoogleCloud", "GoogleCloud Submission Failure: incomplete metadata transfer");
}
// Store BagIt file
String fileName = spaceName + ".v" + dv.getFriendlyVersionNumber() + ".zip";
// Add BagIt ZIP file
// Google uses MD5 as one way to verify the transfer
messageDigest = MessageDigest.getInstance("MD5");
try (PipedInputStream in = new PipedInputStream(100000);
DigestInputStream digestInputStream2 = new DigestInputStream(in, messageDigest)) {
Thread writeThread = new Thread(new Runnable() {
public void run() {
try (PipedOutputStream out = new PipedOutputStream(in)) {
// Generate bag
BagGenerator bagger = new BagGenerator(new OREMap(dv, false), dataciteXml);
bagger.setAuthenticationKey(token.getTokenString());
bagger.generateBag(out);
} catch (Exception e) {
logger.severe("Error creating bag: " + e.getMessage());
e.printStackTrace();
try {
digestInputStream2.close();
} catch (Exception ex) {
logger.warning(ex.getLocalizedMessage());
}
throw new RuntimeException("Error creating bag: " + e.getMessage());
}
}
});
writeThread.start();
/*
* The following loop handles two issues. First, with no delay, the
* bucket.create() call below can get started before the piped streams are set
* up, causing a failure (seen when triggered in a PostPublishDataset workflow).
* A minimal initial wait, e.g. until some bytes are available, would address
* this. Second, the BagGenerator class, due to its use of parallel streaming
* creation of the zip file, has the characteristic that it makes a few bytes
* available - from setting up the directory structure for the zip file -
* significantly earlier than it is ready to stream file content (e.g. for
* thousands of files and GB of content). If, for these large datasets,
* bucket.create() is called as soon as bytes are available, the call can
* timeout before the bytes for all the zipped files are available. To manage
* this, the loop waits until 90K bytes are available, larger than any expected
* dir structure for the zip and implying that the main zipped content is
* available, or until the thread terminates, with all of its content written to
* the pipe. (Note the PipedInputStream buffer is set at 100K above - I didn't
* want to test whether that means that exactly 100K bytes will be available()
* for large datasets or not, so the test below is at 90K.)
*
* An additional sanity check limits the wait to 2K seconds. The BagGenerator
* has been used to archive >120K files, 2K directories, and ~600GB files on the
* SEAD project (streaming content to disk rather than over an internet
* connection); runs of that size take far longer than 2K seconds (10+ hours) and might
* produce an initial set of bytes for directories > 90K. If Dataverse ever
* needs to support datasets of this size, the numbers here would need to be
* increased, and/or a change in how archives are sent to google (e.g. as
* multiple blobs that get aggregated) would be required.
*/
i = 0;
while (digestInputStream2.available() <= 90000 && i < 2000 && writeThread.isAlive()) {
Thread.sleep(1000);
logger.fine("avail: " + digestInputStream2.available() + " : " + writeThread.getState().toString());
i++;
}
logger.fine("Bag: transfer started, i=" + i + ", avail = " + digestInputStream2.available());
if (i == 2000) {
throw new IOException("Stream not available");
}
Blob bag = bucket.create(spaceName + "/" + fileName, digestInputStream2, "application/zip", Bucket.BlobWriteOption.doesNotExist());
if (bag.getSize() == 0) {
throw new IOException("Empty Bag");
}
blobIdString = bag.getBlobId().getBucket() + "/" + bag.getBlobId().getName();
checksum = bag.getMd5ToHexString();
logger.fine("Bag: " + fileName + " added with checksum: " + checksum);
localchecksum = Hex.encodeHexString(digestInputStream2.getMessageDigest().digest());
if (!checksum.equals(localchecksum)) {
logger.severe(checksum + " not equal to " + localchecksum);
return new Failure("Error in transferring Zip file to GoogleCloud", "GoogleCloud Submission Failure: incomplete archive transfer");
}
} catch (RuntimeException rte) {
logger.severe("Error creating Bag during GoogleCloud archiving: " + rte.getMessage());
return new Failure("Error in generating Bag", "GoogleCloud Submission Failure: archive file not created");
}
logger.fine("GoogleCloud Submission step: Content Transferred");
// Document the dataset's archival copy location (actually the URL where an admin can view it)
StringBuffer sb = new StringBuffer("https://console.cloud.google.com/storage/browser/");
sb.append(blobIdString);
dv.setArchivalCopyLocation(sb.toString());
} catch (RuntimeException rte) {
logger.severe("Error creating datacite xml file during GoogleCloud Archiving: " + rte.getMessage());
return new Failure("Error in generating datacite.xml file", "GoogleCloud Submission Failure: metadata file not created");
}
} else {
logger.warning("GoogleCloud Submision Workflow aborted: Dataset locked for pidRegister");
return new Failure("Dataset locked");
}
} catch (Exception e) {
logger.warning(e.getLocalizedMessage());
e.printStackTrace();
return new Failure("GoogleCloud Submission Failure", e.getLocalizedMessage() + ": check log for details");
}
return WorkflowStepResult.OK;
} else {
return new Failure("GoogleCloud Submission not configured - no \":GoogleCloudBucket\" and/or \":GoogleCloudProject\".");
}
}
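The long comment inside the GoogleCloud command explains why the code waits until roughly 90K bytes are buffered, the writer thread has finished, or a 2K-second limit is hit before handing the stream to bucket.create(). Pulled out on its own, that wait amounts to a small helper like the one below (awaitPipeReady and its parameters are illustrative, not part of Dataverse):

import java.io.IOException;
import java.io.InputStream;

// Illustrative helper: block until the producer thread has buffered at least
// minBytes into the pipe or has terminated, giving up after maxSeconds.
static void awaitPipeReady(InputStream in, Thread producer, int minBytes, int maxSeconds)
        throws IOException, InterruptedException {
    int waited = 0;
    while (in.available() <= minBytes && waited < maxSeconds && producer.isAlive()) {
        Thread.sleep(1000);
        waited++;
    }
    if (waited >= maxSeconds) {
        throw new IOException("Stream not available after " + maxSeconds + " seconds");
    }
}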
Use of edu.harvard.iq.dataverse.DataCitation in project dataverse by IQSS.
The class Access, method datafileBundle.
// @EJB
// TODO:
// versions? -- L.A. 4.0 beta 10
@Path("datafile/bundle/{fileId}")
@GET
@Produces({ "application/zip" })
public BundleDownloadInstance datafileBundle(@PathParam("fileId") String fileId, @QueryParam("fileMetadataId") Long fileMetadataId, @QueryParam("gbrecs") boolean gbrecs, @QueryParam("key") String apiToken, @Context UriInfo uriInfo, @Context HttpHeaders headers, @Context HttpServletResponse response) /*throws NotFoundException, ServiceUnavailableException, PermissionDeniedException, AuthorizationRequiredException*/
{
GuestbookResponse gbr = null;
DataFile df = findDataFileOrDieWrapper(fileId);
if (apiToken == null || apiToken.equals("")) {
apiToken = headers.getHeaderString(API_KEY_HEADER);
}
// This will throw a ForbiddenException if access isn't authorized:
checkAuthorization(df, apiToken);
if (!gbrecs && df.isReleased()) {
// Write Guestbook record if not done previously and file is released
User apiTokenUser = findAPITokenUser(apiToken);
gbr = guestbookResponseService.initAPIGuestbookResponse(df.getOwner(), df, session, apiTokenUser);
guestbookResponseService.save(gbr);
MakeDataCountEntry entry = new MakeDataCountEntry(uriInfo, headers, dvRequestService, df);
mdcLogService.logEntry(entry);
}
DownloadInfo dInfo = new DownloadInfo(df);
BundleDownloadInstance downloadInstance = new BundleDownloadInstance(dInfo);
FileMetadata fileMetadata = null;
if (fileMetadataId == null) {
fileMetadata = df.getFileMetadata();
} else {
fileMetadata = dataFileService.findFileMetadata(fileMetadataId);
}
downloadInstance.setFileCitationEndNote(new DataCitation(fileMetadata).toEndNoteString());
downloadInstance.setFileCitationRIS(new DataCitation(fileMetadata).toRISString());
downloadInstance.setFileCitationBibtex(new DataCitation(fileMetadata).toBibtexString());
ByteArrayOutputStream outStream = new ByteArrayOutputStream();
Long dfId = df.getId();
try {
ddiExportService.exportDataFile(dfId, outStream, null, null, fileMetadataId);
downloadInstance.setFileDDIXML(outStream.toString());
} catch (Exception ex) {
// if we can't generate the DDI, it's ok;
// we'll just generate the bundle without it.
}
return downloadInstance;
}
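For file downloads, DataCitation is constructed from a FileMetadata rather than a DatasetVersion, and one instance can serve all three citation formats (the method above builds a new instance per format, which works but is redundant). A minimal sketch, assuming a fileMetadata resolved as in datafileBundle():

// Sketch only: `fileMetadata` is assumed to be resolved as in the method above.
DataCitation citation = new DataCitation(fileMetadata);
String endNote = citation.toEndNoteString(); // EndNote XML citation
String ris = citation.toRISString();         // RIS citation
String bibtex = citation.toBibtexString();   // BibTeX citation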
Use of edu.harvard.iq.dataverse.DataCitation in project dataverse by IQSS.
The class DataCiteExporter, method exportDataset.
@Override
public void exportDataset(DatasetVersion version, JsonObject json, OutputStream outputStream) throws ExportException {
try {
DataCitation dc = new DataCitation(version);
Map<String, String> metadata = dc.getDataCiteMetadata();
String xml = DOIDataCiteRegisterService.getMetadataFromDvObject(version.getDataset().getGlobalId().asString(), metadata, version.getDataset());
outputStream.write(xml.getBytes(Charset.forName("utf-8")));
} catch (IOException e) {
throw new ExportException("Caught IOException performing DataCite export");
}
}
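An illustrative way to drive this exporter and capture the DataCite XML as a String (the exportToString helper is hypothetical; the JsonObject argument is not used by the body shown above, so null is passed):

import java.io.ByteArrayOutputStream;
import java.nio.charset.StandardCharsets;

// Illustrative only; not part of the Dataverse codebase.
static String exportToString(DataCiteExporter exporter, DatasetVersion version) throws Exception {
    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    exporter.exportDataset(version, null, buffer); // writes the DataCite XML into the buffer
    return buffer.toString(StandardCharsets.UTF_8.name());
}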