use of edu.harvard.iq.dataverse.DataFile in project dataverse by IQSS.
the class ImportServiceBean method doImportHarvestedDataset.
@TransactionAttribute(TransactionAttributeType.REQUIRES_NEW)
public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, HarvestingClient harvestingClient, String harvestIdentifier, String metadataFormat, File metadataFile, PrintWriter cleanupLog) throws ImportException, IOException {
if (harvestingClient == null || harvestingClient.getDataverse() == null) {
throw new ImportException("importHarvestedDataset called wiht a null harvestingClient, or an invalid harvestingClient.");
}
Dataverse owner = harvestingClient.getDataverse();
Dataset importedDataset = null;
DatasetDTO dsDTO = null;
String json = null;
if ("ddi".equalsIgnoreCase(metadataFormat) || "oai_ddi".equals(metadataFormat) || metadataFormat.toLowerCase().matches("^oai_ddi.*")) {
try {
String xmlToParse = new String(Files.readAllBytes(metadataFile.toPath()));
// TODO:
// import type should be configurable - it should be possible to
// select whether you want to harvest with or without files,
// ImportType.HARVEST vs. ImportType.HARVEST_WITH_FILES
logger.fine("importing DDI " + metadataFile.getAbsolutePath());
dsDTO = importDDIService.doImport(ImportType.HARVEST_WITH_FILES, xmlToParse);
} catch (IOException | XMLStreamException | ImportException e) {
throw new ImportException("Failed to process DDI XML record: " + e.getClass() + " (" + e.getMessage() + ")");
}
} else if ("dc".equalsIgnoreCase(metadataFormat) || "oai_dc".equals(metadataFormat)) {
logger.fine("importing DC " + metadataFile.getAbsolutePath());
try {
String xmlToParse = new String(Files.readAllBytes(metadataFile.toPath()));
dsDTO = importGenericService.processOAIDCxml(xmlToParse);
} catch (IOException | XMLStreamException e) {
throw new ImportException("Failed to process Dublin Core XML record: " + e.getClass() + " (" + e.getMessage() + ")");
}
} else if ("dataverse_json".equals(metadataFormat)) {
// This is Dataverse metadata already formatted in JSON.
// Simply read it into a string, and pass to the final import further down:
logger.fine("Attempting to import custom dataverse metadata from file " + metadataFile.getAbsolutePath());
json = new String(Files.readAllBytes(metadataFile.toPath()));
} else {
throw new ImportException("Unsupported import metadata format: " + metadataFormat);
}
if (json == null) {
if (dsDTO != null) {
// convert DTO to Json,
Gson gson = new GsonBuilder().setPrettyPrinting().create();
json = gson.toJson(dsDTO);
logger.fine("JSON produced for the metadata harvested: " + json);
} else {
throw new ImportException("Failed to transform XML metadata format " + metadataFormat + " into a DatasetDTO");
}
}
JsonReader jsonReader = Json.createReader(new StringReader(json));
JsonObject obj = jsonReader.readObject();
// and call parse Json to read it into a dataset
try {
JsonParser parser = new JsonParser(datasetfieldService, metadataBlockService, settingsService);
parser.setLenient(true);
Dataset ds = parser.parseDataset(obj);
// For ImportType.NEW, if the metadata contains a global identifier, and it's not a protocol
// we support, it should be rejected.
// (TODO: ! - add some way of keeping track of supported protocols!)
// if (ds.getGlobalId() != null && !ds.getProtocol().equals(settingsService.getValueForKey(SettingsServiceBean.Key.Protocol, ""))) {
// throw new ImportException("Could not register id " + ds.getGlobalId() + ", protocol not supported");
// }
ds.setOwner(owner);
ds.getLatestVersion().setDatasetFields(ds.getLatestVersion().initDatasetFields());
// Check data against required constraints
List<ConstraintViolation<DatasetField>> violations = ds.getVersions().get(0).validateRequired();
if (!violations.isEmpty()) {
// For migration and harvest, add NA for missing required values
for (ConstraintViolation<DatasetField> v : violations) {
DatasetField f = v.getRootBean();
f.setSingleValue(DatasetField.NA_VALUE);
}
}
// Check data against validation constraints
// If we are migrating and "scrub migration data" is true we attempt to fix invalid data
// if the fix fails stop processing of this file by throwing exception
Set<ConstraintViolation> invalidViolations = ds.getVersions().get(0).validate();
ValidatorFactory factory = Validation.buildDefaultValidatorFactory();
Validator validator = factory.getValidator();
if (!invalidViolations.isEmpty()) {
for (ConstraintViolation<DatasetFieldValue> v : invalidViolations) {
DatasetFieldValue f = v.getRootBean();
boolean fixed = false;
boolean converted = false;
// TODO: Is this scrubbing something we want to continue doing?
if (settingsService.isTrueForKey(SettingsServiceBean.Key.ScrubMigrationData, false)) {
fixed = processMigrationValidationError(f, cleanupLog, metadataFile.getName());
converted = true;
if (fixed) {
Set<ConstraintViolation<DatasetFieldValue>> scrubbedViolations = validator.validate(f);
if (!scrubbedViolations.isEmpty()) {
fixed = false;
}
}
}
if (!fixed) {
String msg = "Data modified - File: " + metadataFile.getName() + "; Field: " + f.getDatasetField().getDatasetFieldType().getDisplayName() + "; " + "Invalid value: '" + f.getValue() + "'" + " Converted Value:'" + DatasetField.NA_VALUE + "'";
cleanupLog.println(msg);
f.setValue(DatasetField.NA_VALUE);
}
}
}
// A global identifier is required in order to import this dataset:
if (StringUtils.isEmpty(ds.getGlobalId())) {
throw new ImportException("The harvested metadata record with the OAI server identifier " + harvestIdentifier + " does not contain a global unique identifier that we could recognize, skipping.");
}
ds.setHarvestedFrom(harvestingClient);
ds.setHarvestIdentifier(harvestIdentifier);
Dataset existingDs = datasetService.findByGlobalId(ds.getGlobalId());
if (existingDs != null) {
// If this dataset already exists in another dataverse, we are just going to skip it:
if (existingDs.getOwner() != null && !owner.getId().equals(existingDs.getOwner().getId())) {
throw new ImportException("The dataset with the global id " + ds.getGlobalId() + " already exists, in the dataverse " + existingDs.getOwner().getAlias() + ", skipping.");
}
// If a LOCAL dataset with this global id already exists here, skip it also:
if (!existingDs.isHarvested()) {
throw new ImportException("A LOCAL dataset with the global id " + ds.getGlobalId() + " already exists in this dataverse; skipping.");
}
// We will replace the current version with the imported version.
if (existingDs.getVersions().size() != 1) {
throw new ImportException("Error importing Harvested Dataset, existing dataset has " + existingDs.getVersions().size() + " versions");
}
// Purge all the SOLR documents associated with this dataset from the
// index server:
indexService.deleteHarvestedDocuments(existingDs);
// Harvested files are deleted directly from the database; no need to call the DeleteFileCommand on them:
for (DataFile harvestedFile : existingDs.getFiles()) {
DataFile merged = em.merge(harvestedFile);
em.remove(merged);
harvestedFile = null;
}
// TODO:
// Verify what happens with the indexed files in SOLR?
// are they going to be overwritten by the reindexing of the dataset?
existingDs.setFiles(null);
Dataset merged = em.merge(existingDs);
engineSvc.submit(new DestroyDatasetCommand(merged, dataverseRequest));
importedDataset = engineSvc.submit(new CreateDatasetCommand(ds, dataverseRequest, false, ImportType.HARVEST));
} else {
importedDataset = engineSvc.submit(new CreateDatasetCommand(ds, dataverseRequest, false, ImportType.HARVEST));
}
} catch (JsonParseException | ImportException | CommandException ex) {
logger.fine("Failed to import harvested dataset: " + ex.getClass() + ": " + ex.getMessage());
FileOutputStream savedJsonFileStream = new FileOutputStream(new File(metadataFile.getAbsolutePath() + ".json"));
byte[] jsonBytes = json.getBytes();
int i = 0;
while (i < jsonBytes.length) {
int chunkSize = i + 8192 <= jsonBytes.length ? 8192 : jsonBytes.length - i;
savedJsonFileStream.write(jsonBytes, i, chunkSize);
i += chunkSize;
savedJsonFileStream.flush();
}
savedJsonFileStream.close();
logger.info("JSON produced saved in " + metadataFile.getAbsolutePath() + ".json");
throw new ImportException("Failed to import harvested dataset: " + ex.getClass() + " (" + ex.getMessage() + ")", ex);
}
return importedDataset;
}
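A minimal sketch of how a harvesting job might call this method. The importService, dataverseRequest, harvestingClient, and logger references, the OAI identifier, and the temp-file handling are all illustrative assumptions, not code from the project.
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
try (PrintWriter cleanupLog = new PrintWriter(new FileWriter("harvest-cleanup.log", true))) {
    // hypothetical temp file holding the harvested metadata record
    File metadataFile = File.createTempFile("harvested", ".xml");
    // ... write the harvested OAI record into metadataFile before calling the import ...
    Dataset imported = importService.doImportHarvestedDataset(
            dataverseRequest,          // the request/user on whose behalf the harvest runs
            harvestingClient,          // must be non-null and linked to a target dataverse
            "oai:example.org:12345",   // hypothetical OAI record identifier
            "oai_dc",                  // supported formats: ddi / oai_ddi*, dc / oai_dc, dataverse_json
            metadataFile,              // file containing the harvested metadata record
            cleanupLog);               // receives notes about values scrubbed or replaced with NA
    if (imported != null) {
        logger.info("imported harvested dataset " + imported.getGlobalId());
    }
} catch (ImportException | IOException ex) {
    logger.warning("failed to import harvested record: " + ex.getMessage());
}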
use of edu.harvard.iq.dataverse.DataFile in project dataverse by IQSS.
the class TestIngest method datafile.
// @EJB
@Path("test/file")
@GET
@Produces({ "text/plain" })
public String datafile(@QueryParam("fileName") String fileName, @QueryParam("fileType") String fileType, @Context UriInfo uriInfo, @Context HttpHeaders headers, @Context HttpServletResponse response) /*throws NotFoundException, ServiceUnavailableException, PermissionDeniedException, AuthorizationRequiredException*/
{
String output = "";
if (StringUtil.isEmpty(fileName) || StringUtil.isEmpty(fileType)) {
output = output.concat("Usage: /api/ingest/test/file?fileName=PATH&fileType=TYPE");
return output;
}
BufferedInputStream fileInputStream = null;
try {
fileInputStream = new BufferedInputStream(new FileInputStream(new File(fileName)));
} catch (FileNotFoundException notfoundEx) {
fileInputStream = null;
}
if (fileInputStream == null) {
output = output.concat("Could not open file " + fileName + ".");
return output;
}
TabularDataFileReader ingestPlugin = ingestService.getTabDataReaderByMimeType(fileType);
if (ingestPlugin == null) {
output = output.concat("Could not locate an ingest plugin for type " + fileType + ".");
return output;
}
TabularDataIngest tabDataIngest = null;
try {
tabDataIngest = ingestPlugin.read(fileInputStream, null);
} catch (IOException ingestEx) {
output = output.concat("Caught an exception trying to ingest file " + fileName + ".");
return output;
}
try {
if (tabDataIngest != null) {
File tabFile = tabDataIngest.getTabDelimitedFile();
if (tabDataIngest.getDataTable() != null && tabFile != null && tabFile.exists()) {
String tabFilename = FileUtil.replaceExtension(fileName, "tab");
java.nio.file.Files.copy(Paths.get(tabFile.getAbsolutePath()), Paths.get(tabFilename), StandardCopyOption.REPLACE_EXISTING);
DataTable dataTable = tabDataIngest.getDataTable();
DataFile dataFile = new DataFile();
dataFile.setStorageIdentifier(tabFilename);
FileMetadata fileMetadata = new FileMetadata();
fileMetadata.setLabel(fileName);
dataFile.setDataTable(dataTable);
dataTable.setDataFile(dataFile);
fileMetadata.setDataFile(dataFile);
dataFile.getFileMetadatas().add(fileMetadata);
output = output.concat("NVARS: " + dataTable.getVarQuantity() + "\n");
output = output.concat("NOBS: " + dataTable.getCaseQuantity() + "\n");
try {
ingestService.produceSummaryStatistics(dataFile, tabFile);
output = output.concat("UNF: " + dataTable.getUnf() + "\n");
} catch (IOException ioex) {
output = output.concat("UNF: failed to calculate\n" + "\n");
}
for (int i = 0; i < dataTable.getVarQuantity(); i++) {
String vartype = "";
// if ("continuous".equals(dataTable.getDataVariables().get(i).getVariableIntervalType().getName())) {
if (dataTable.getDataVariables().get(i).isIntervalContinuous()) {
vartype = "numeric-continuous";
} else {
if (dataTable.getDataVariables().get(i).isTypeNumeric()) {
vartype = "numeric-discrete";
} else {
String formatCategory = dataTable.getDataVariables().get(i).getFormatCategory();
if ("time".equals(formatCategory)) {
vartype = "character-time";
} else if ("date".equals(formatCategory)) {
vartype = "character-date";
} else {
vartype = "character";
}
}
}
output = output.concat("VAR" + i + " ");
output = output.concat(dataTable.getDataVariables().get(i).getName() + " ");
output = output.concat(vartype + " ");
output = output.concat(dataTable.getDataVariables().get(i).getUnf());
output = output.concat("\n");
}
} else {
output = output.concat("Ingest failed to produce tab file or data table for file " + fileName + ".");
return output;
}
} else {
output = output.concat("Ingest resulted in a null tabDataIngest object for file " + fileName + ".");
return output;
}
} catch (IOException ex) {
output = output.concat("Caught an exception trying to save ingested data for file " + fileName + ".");
return output;
}
return output;
}
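Going by the usage string the method itself prints, the endpoint can be exercised with a plain GET request. The host, file path, and MIME type below are placeholders, and fileName must point to a file readable by the server process; this is a hedged sketch using java.net.http, not a documented client.
import java.net.URI;
import java.net.URLEncoder;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.charset.StandardCharsets;
public class TestIngestCall {
    public static void main(String[] args) throws Exception {
        HttpClient client = HttpClient.newHttpClient();
        // assumed local instance and placeholder file/type values
        String url = "http://localhost:8080/api/ingest/test/file"
                + "?fileName=" + URLEncoder.encode("/tmp/example.dta", StandardCharsets.UTF_8)
                + "&fileType=" + URLEncoder.encode("application/x-stata", StandardCharsets.UTF_8);
        HttpRequest request = HttpRequest.newBuilder(URI.create(url)).GET().build();
        HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
        // On success the body lists NVARS, NOBS, UNF, and one line per variable with its type and UNF.
        System.out.println(response.body());
    }
}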
use of edu.harvard.iq.dataverse.DataFile in project dataverse by IQSS.
the class ExternalTools method getExternalToolsByFile.
@GET
@Path("file/{id}")
public Response getExternalToolsByFile(@PathParam("id") Long fileIdFromUser) {
DataFile dataFile = fileSvc.find(fileIdFromUser);
if (dataFile == null) {
return error(BAD_REQUEST, "Could not find datafile with id " + fileIdFromUser);
}
JsonArrayBuilder tools = Json.createArrayBuilder();
List<ExternalTool> allExternalTools = externalToolService.findAll();
List<ExternalTool> toolsByFile = ExternalToolServiceBean.findExternalToolsByFile(allExternalTools, dataFile);
for (ExternalTool tool : toolsByFile) {
tools.add(tool.toJson());
}
return ok(tools);
}
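The method responds with ok(tools), i.e. the tool descriptions as a JSON array inside the standard API response envelope. A hedged sketch of consuming that response on the client side; the "data" envelope key is assumed from the usual Dataverse response shape and is not shown in this excerpt.
import java.io.StringReader;
import javax.json.Json;
import javax.json.JsonArray;
import javax.json.JsonObject;
import javax.json.JsonReader;
import javax.json.JsonValue;
// responseBody is the raw JSON string returned by the GET .../file/{id} call.
static void printTools(String responseBody) {
    try (JsonReader reader = Json.createReader(new StringReader(responseBody))) {
        JsonObject envelope = reader.readObject();
        JsonArray tools = envelope.getJsonArray("data"); // assumed envelope key
        for (JsonValue tool : tools) {
            // each entry is one ExternalTool as serialized by tool.toJson()
            System.out.println(tool);
        }
    }
}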
use of edu.harvard.iq.dataverse.DataFile in project dataverse by IQSS.
the class Index method indexTypeById.
@GET
@Path("{type}/{id}")
public Response indexTypeById(@PathParam("type") String type, @PathParam("id") Long id) {
try {
if (type.equals("dataverses")) {
Dataverse dataverse = dataverseService.find(id);
if (dataverse != null) {
/**
* @todo Can we display the result of indexing to the user?
*/
Future<String> indexDataverseFuture = indexService.indexDataverse(dataverse);
return ok("starting reindex of dataverse " + id);
} else {
String response = indexService.removeSolrDocFromIndex(IndexServiceBean.solrDocIdentifierDataverse + id);
return notFound("Could not find dataverse with id of " + id + ". Result from deletion attempt: " + response);
}
} else if (type.equals("datasets")) {
Dataset dataset = datasetService.find(id);
if (dataset != null) {
boolean doNormalSolrDocCleanUp = true;
Future<String> indexDatasetFuture = indexService.indexDataset(dataset, doNormalSolrDocCleanUp);
return ok("starting reindex of dataset " + id);
} else {
/**
* @todo what about published, deaccessioned, etc.? Need
* method to target those, not just drafts!
*/
String response = indexService.removeSolrDocFromIndex(IndexServiceBean.solrDocIdentifierDataset + id + IndexServiceBean.draftSuffix);
return notFound("Could not find dataset with id of " + id + ". Result from deletion attempt: " + response);
}
} else if (type.equals("files")) {
DataFile dataFile = dataFileService.find(id);
Dataset datasetThatOwnsTheFile = datasetService.find(dataFile.getOwner().getId());
/**
* @todo How can we display the result to the user?
*/
boolean doNormalSolrDocCleanUp = true;
Future<String> indexDatasetFuture = indexService.indexDataset(datasetThatOwnsTheFile, doNormalSolrDocCleanUp);
return ok("started reindexing " + type + "/" + id);
} else {
return error(Status.BAD_REQUEST, "illegal type: " + type);
}
} catch (EJBException ex) {
Throwable cause = ex;
StringBuilder sb = new StringBuilder();
sb.append("Problem indexing ").append(type).append("/").append(id).append(": ");
sb.append(ex).append(" ");
while (cause.getCause() != null) {
cause = cause.getCause();
sb.append(cause.getClass().getCanonicalName()).append(" ");
sb.append(cause.getMessage()).append(" ");
if (cause instanceof ConstraintViolationException) {
ConstraintViolationException constraintViolationException = (ConstraintViolationException) cause;
for (ConstraintViolation<?> violation : constraintViolationException.getConstraintViolations()) {
sb.append("(invalid value: <<<").append(violation.getInvalidValue()).append(">>> for ").append(violation.getPropertyPath()).append(" at ").append(violation.getLeafBean()).append(" - ").append(violation.getMessage()).append(")");
}
} else if (cause instanceof NullPointerException) {
for (int i = 0; i < 2; i++) {
StackTraceElement stacktrace = cause.getStackTrace()[i];
if (stacktrace != null) {
String classCanonicalName = stacktrace.getClass().getCanonicalName();
String methodName = stacktrace.getMethodName();
int lineNumber = stacktrace.getLineNumber();
String error = "at " + stacktrace.getClassName() + "." + stacktrace.getMethodName() + "(" + stacktrace.getFileName() + ":" + lineNumber + ") ";
sb.append(error);
}
}
}
}
return error(Status.INTERNAL_SERVER_ERROR, sb.toString());
}
}
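The method fires indexService.indexDataset asynchronously and discards the returned Future, which is what the @todo comments about showing the result to the user refer to. A hedged sketch of how the result could be surfaced if the endpoint chose to wait; the 30-second timeout is an arbitrary illustration, not project behavior.
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
// Inside the "datasets" branch, instead of returning immediately:
Future<String> indexDatasetFuture = indexService.indexDataset(dataset, true);
try {
    String result = indexDatasetFuture.get(30, TimeUnit.SECONDS); // hypothetical timeout
    return ok("reindexed dataset " + id + ": " + result);
} catch (InterruptedException | ExecutionException | TimeoutException ex) {
    return error(Status.INTERNAL_SERVER_ERROR, "indexing of dataset " + id + " did not complete: " + ex.getMessage());
}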
use of edu.harvard.iq.dataverse.DataFile in project dataverse by IQSS.
the class BundleDownloadInstanceWriter method writeTo.
@Override
public void writeTo(BundleDownloadInstance di, Class<?> clazz, Type type, Annotation[] annotation, MediaType mediaType, MultivaluedMap<String, Object> httpHeaders, OutputStream outstream) throws IOException, WebApplicationException {
try {
if (di.getDownloadInfo() != null && di.getDownloadInfo().getDataFile() != null) {
DataAccessRequest daReq = new DataAccessRequest();
DataFile sf = di.getDownloadInfo().getDataFile();
StorageIO<DataFile> accessObject = DataAccess.getStorageIO(sf, daReq);
if (accessObject != null) {
accessObject.open();
ZipOutputStream zout = new ZipOutputStream(outstream);
/* First, the tab file itself: */
String fileName = accessObject.getFileName();
String zipFileName = fileName.replaceAll("\\.tab$", "-bundle.zip");
httpHeaders.add("Content-disposition", "attachment; filename=\"" + zipFileName + "\"");
httpHeaders.add("Content-Type", "application/zip; name=\"" + zipFileName + "\"");
InputStream instream = accessObject.getInputStream();
ZipEntry e = new ZipEntry(fileName);
zout.putNextEntry(e);
String varHeaderLine = accessObject.getVarHeader();
if (varHeaderLine != null) {
zout.write(varHeaderLine.getBytes());
}
byte[] data = new byte[8192];
int i = 0;
while ((i = instream.read(data)) > 0) {
zout.write(data, 0, i);
zout.flush();
}
instream.close();
zout.closeEntry();
instream = null;
// Now, the original format:
String origFormat = null;
try {
// .retrieve(sf, (FileAccessIO) accessObject);
StorageIO<DataFile> accessObjectOrig = StoredOriginalFile.retreive(accessObject);
if (accessObjectOrig != null) {
instream = accessObjectOrig.getInputStream();
if (instream != null) {
String origFileName = accessObjectOrig.getFileName();
origFormat = accessObject.getMimeType();
e = new ZipEntry(origFileName);
zout.putNextEntry(e);
i = 0;
while ((i = instream.read(data)) > 0) {
zout.write(data, 0, i);
zout.flush();
}
}
}
} catch (IOException ioex) {
// ignore; if for whatever reason the original is not
// available, we'll just skip it.
logger.warning("failed to retrieve saved original for " + fileName);
} finally {
if (instream != null) {
try {
instream.close();
} catch (IOException ioex) {
}
try {
zout.closeEntry();
} catch (IOException ioex) {
}
}
}
instream = null;
// add an RData version:
if (!"application/x-rlang-transport".equals(origFormat)) {
try {
StorageIO<DataFile> accessObjectRdata = DataConverter.performFormatConversion(sf, accessObject, "RData", "application/x-rlang-transport");
if (accessObjectRdata != null) {
instream = accessObjectRdata.getInputStream();
if (instream != null) {
String rdataFileName = accessObjectRdata.getFileName();
e = new ZipEntry(rdataFileName);
zout.putNextEntry(e);
i = 0;
while ((i = instream.read(data)) > 0) {
zout.write(data, 0, i);
zout.flush();
}
}
}
} catch (IOException ioex) {
// ignore; if for whatever reason RData conversion is not
// available, we'll just skip it.
logger.warning("failed to convert tabular data file " + fileName + " to RData.");
} finally {
if (instream != null) {
try {
instream.close();
} catch (IOException ioex) {
}
try {
zout.closeEntry();
} catch (IOException ioex) {
}
}
}
}
// And the variable metadata (DDI/XML), if available:
if (di.getFileDDIXML() != null) {
e = new ZipEntry(fileName.replaceAll("\\.tab$", "-ddi.xml"));
zout.putNextEntry(e);
zout.write(di.getFileDDIXML().getBytes());
zout.closeEntry();
}
// And now the citations:
if (di.getFileCitationEndNote() != null) {
e = new ZipEntry(fileName.replaceAll("\\.tab$", "citation-endnote.xml"));
zout.putNextEntry(e);
zout.write(di.getFileCitationEndNote().getBytes());
zout.closeEntry();
}
if (di.getFileCitationRIS() != null) {
e = new ZipEntry(fileName.replaceAll("\\.tab$", "citation-ris.ris"));
zout.putNextEntry(e);
zout.write(di.getFileCitationRIS().getBytes());
zout.closeEntry();
}
if (di.getFileCitationBibtex() != null) {
e = new ZipEntry(fileName.replaceAll("\\.tab$", "citation-bib.bib"));
zout.putNextEntry(e);
zout.write(di.getFileCitationBibtex().getBytes());
zout.closeEntry();
}
zout.close();
return;
}
}
} catch (IOException ioex) {
throw new WebApplicationException(Response.Status.INTERNAL_SERVER_ERROR);
}
throw new WebApplicationException(Response.Status.NOT_FOUND);
}
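The same buffered copy loop (fill an 8 KB buffer from the input stream, write it to the current ZipEntry, flush) appears three times above, for the tab file, the stored original, and the RData conversion. A hedged sketch of how that pattern could be factored into a helper; this is an illustrative refactor, not code from the project.
import java.io.IOException;
import java.io.InputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
// Copies one input stream into a named entry of the ZIP; the caller remains responsible for closing both streams.
private static void writeZipEntry(ZipOutputStream zout, String entryName, InputStream instream) throws IOException {
    zout.putNextEntry(new ZipEntry(entryName));
    byte[] data = new byte[8192];
    int n;
    while ((n = instream.read(data)) > 0) {
        zout.write(data, 0, n);
    }
    zout.closeEntry();
}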