Search in sources :

Example 6 with DatasetDTO

use of edu.harvard.iq.dataverse.api.dto.DatasetDTO in project dataverse by IQSS.

the class ImportServiceBean method doImportHarvestedDataset.

@TransactionAttribute(TransactionAttributeType.REQUIRES_NEW)
public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, HarvestingClient harvestingClient, String harvestIdentifier, String metadataFormat, File metadataFile, PrintWriter cleanupLog) throws ImportException, IOException {
    if (harvestingClient == null || harvestingClient.getDataverse() == null) {
        throw new ImportException("importHarvestedDataset called wiht a null harvestingClient, or an invalid harvestingClient.");
    }
    Dataverse owner = harvestingClient.getDataverse();
    Dataset importedDataset = null;
    DatasetDTO dsDTO = null;
    String json = null;
    if ("ddi".equalsIgnoreCase(metadataFormat) || "oai_ddi".equals(metadataFormat) || metadataFormat.toLowerCase().matches("^oai_ddi.*")) {
        try {
            String xmlToParse = new String(Files.readAllBytes(metadataFile.toPath()));
            // TODO:
            // import type should be configurable - it should be possible to
            // select whether you want to harvest with or without files,
            // ImportType.HARVEST vs. ImportType.HARVEST_WITH_FILES
            logger.fine("importing DDI " + metadataFile.getAbsolutePath());
            dsDTO = importDDIService.doImport(ImportType.HARVEST_WITH_FILES, xmlToParse);
        } catch (IOException | XMLStreamException | ImportException e) {
            throw new ImportException("Failed to process DDI XML record: " + e.getClass() + " (" + e.getMessage() + ")");
        }
    } else if ("dc".equalsIgnoreCase(metadataFormat) || "oai_dc".equals(metadataFormat)) {
        logger.fine("importing DC " + metadataFile.getAbsolutePath());
        try {
            String xmlToParse = new String(Files.readAllBytes(metadataFile.toPath()));
            dsDTO = importGenericService.processOAIDCxml(xmlToParse);
        } catch (IOException | XMLStreamException e) {
            throw new ImportException("Failed to process Dublin Core XML record: " + e.getClass() + " (" + e.getMessage() + ")");
        }
    } else if ("dataverse_json".equals(metadataFormat)) {
        // This is Dataverse metadata already formatted in JSON.
        // Simply read it into a string, and pass to the final import further down:
        logger.fine("Attempting to import custom dataverse metadata from file " + metadataFile.getAbsolutePath());
        json = new String(Files.readAllBytes(metadataFile.toPath()));
    } else {
        throw new ImportException("Unsupported import metadata format: " + metadataFormat);
    }
    if (json == null) {
        if (dsDTO != null) {
            // convert DTO to Json,
            Gson gson = new GsonBuilder().setPrettyPrinting().create();
            json = gson.toJson(dsDTO);
            logger.fine("JSON produced for the metadata harvested: " + json);
        } else {
            throw new ImportException("Failed to transform XML metadata format " + metadataFormat + " into a DatasetDTO");
        }
    }
    JsonReader jsonReader = Json.createReader(new StringReader(json));
    JsonObject obj = jsonReader.readObject();
    // and call parse Json to read it into a dataset
    try {
        JsonParser parser = new JsonParser(datasetfieldService, metadataBlockService, settingsService);
        parser.setLenient(true);
        Dataset ds = parser.parseDataset(obj);
        // For ImportType.NEW, if the metadata contains a global identifier, and it's not a protocol
        // we support, it should be rejected.
        // (TODO: ! - add some way of keeping track of supported protocols!)
        // if (ds.getGlobalId() != null && !ds.getProtocol().equals(settingsService.getValueForKey(SettingsServiceBean.Key.Protocol, ""))) {
        // throw new ImportException("Could not register id " + ds.getGlobalId() + ", protocol not supported");
        // }
        ds.setOwner(owner);
        ds.getLatestVersion().setDatasetFields(ds.getLatestVersion().initDatasetFields());
        // Check data against required contraints
        List<ConstraintViolation<DatasetField>> violations = ds.getVersions().get(0).validateRequired();
        if (!violations.isEmpty()) {
            // For migration and harvest, add NA for missing required values
            for (ConstraintViolation<DatasetField> v : violations) {
                DatasetField f = v.getRootBean();
                f.setSingleValue(DatasetField.NA_VALUE);
            }
        }
        // Check data against validation constraints
        // If we are migrating and "scrub migration data" is true we attempt to fix invalid data
        // if the fix fails stop processing of this file by throwing exception
        Set<ConstraintViolation> invalidViolations = ds.getVersions().get(0).validate();
        ValidatorFactory factory = Validation.buildDefaultValidatorFactory();
        Validator validator = factory.getValidator();
        if (!invalidViolations.isEmpty()) {
            for (ConstraintViolation<DatasetFieldValue> v : invalidViolations) {
                DatasetFieldValue f = v.getRootBean();
                boolean fixed = false;
                boolean converted = false;
                // TODO: Is this scrubbing something we want to continue doing?
                if (settingsService.isTrueForKey(SettingsServiceBean.Key.ScrubMigrationData, false)) {
                    fixed = processMigrationValidationError(f, cleanupLog, metadataFile.getName());
                    converted = true;
                    if (fixed) {
                        Set<ConstraintViolation<DatasetFieldValue>> scrubbedViolations = validator.validate(f);
                        if (!scrubbedViolations.isEmpty()) {
                            fixed = false;
                        }
                    }
                }
                if (!fixed) {
                    String msg = "Data modified - File: " + metadataFile.getName() + "; Field: " + f.getDatasetField().getDatasetFieldType().getDisplayName() + "; " + "Invalid value:  '" + f.getValue() + "'" + " Converted Value:'" + DatasetField.NA_VALUE + "'";
                    cleanupLog.println(msg);
                    f.setValue(DatasetField.NA_VALUE);
                }
            }
        }
        // this dataset:
        if (StringUtils.isEmpty(ds.getGlobalId())) {
            throw new ImportException("The harvested metadata record with the OAI server identifier " + harvestIdentifier + " does not contain a global unique identifier that we could recognize, skipping.");
        }
        ds.setHarvestedFrom(harvestingClient);
        ds.setHarvestIdentifier(harvestIdentifier);
        Dataset existingDs = datasetService.findByGlobalId(ds.getGlobalId());
        if (existingDs != null) {
            // we are just going to skip it!
            if (existingDs.getOwner() != null && !owner.getId().equals(existingDs.getOwner().getId())) {
                throw new ImportException("The dataset with the global id " + ds.getGlobalId() + " already exists, in the dataverse " + existingDs.getOwner().getAlias() + ", skipping.");
            }
            // skip it also:
            if (!existingDs.isHarvested()) {
                throw new ImportException("A LOCAL dataset with the global id " + ds.getGlobalId() + " already exists in this dataverse; skipping.");
            }
            // We will replace the current version with the imported version.
            if (existingDs.getVersions().size() != 1) {
                throw new ImportException("Error importing Harvested Dataset, existing dataset has " + existingDs.getVersions().size() + " versions");
            }
            // Purge all the SOLR documents associated with this client from the
            // index server:
            indexService.deleteHarvestedDocuments(existingDs);
            // DeleteFileCommand on them.
            for (DataFile harvestedFile : existingDs.getFiles()) {
                DataFile merged = em.merge(harvestedFile);
                em.remove(merged);
                harvestedFile = null;
            }
            // TODO:
            // Verify what happens with the indexed files in SOLR?
            // are they going to be overwritten by the reindexing of the dataset?
            existingDs.setFiles(null);
            Dataset merged = em.merge(existingDs);
            engineSvc.submit(new DestroyDatasetCommand(merged, dataverseRequest));
            importedDataset = engineSvc.submit(new CreateDatasetCommand(ds, dataverseRequest, false, ImportType.HARVEST));
        } else {
            importedDataset = engineSvc.submit(new CreateDatasetCommand(ds, dataverseRequest, false, ImportType.HARVEST));
        }
    } catch (JsonParseException | ImportException | CommandException ex) {
        logger.fine("Failed to import harvested dataset: " + ex.getClass() + ": " + ex.getMessage());
        FileOutputStream savedJsonFileStream = new FileOutputStream(new File(metadataFile.getAbsolutePath() + ".json"));
        byte[] jsonBytes = json.getBytes();
        int i = 0;
        while (i < jsonBytes.length) {
            int chunkSize = i + 8192 <= jsonBytes.length ? 8192 : jsonBytes.length - i;
            savedJsonFileStream.write(jsonBytes, i, chunkSize);
            i += chunkSize;
            savedJsonFileStream.flush();
        }
        savedJsonFileStream.close();
        logger.info("JSON produced saved in " + metadataFile.getAbsolutePath() + ".json");
        throw new ImportException("Failed to import harvested dataset: " + ex.getClass() + " (" + ex.getMessage() + ")", ex);
    }
    return importedDataset;
}
Also used : DatasetField(edu.harvard.iq.dataverse.DatasetField) CreateDatasetCommand(edu.harvard.iq.dataverse.engine.command.impl.CreateDatasetCommand) Gson(com.google.gson.Gson) JsonObject(javax.json.JsonObject) JsonParseException(edu.harvard.iq.dataverse.util.json.JsonParseException) DatasetDTO(edu.harvard.iq.dataverse.api.dto.DatasetDTO) DataFile(edu.harvard.iq.dataverse.DataFile) DatasetFieldValue(edu.harvard.iq.dataverse.DatasetFieldValue) StringReader(java.io.StringReader) JsonReader(javax.json.JsonReader) JsonParser(edu.harvard.iq.dataverse.util.json.JsonParser) ValidatorFactory(javax.validation.ValidatorFactory) GsonBuilder(com.google.gson.GsonBuilder) Dataset(edu.harvard.iq.dataverse.Dataset) IOException(java.io.IOException) CommandException(edu.harvard.iq.dataverse.engine.command.exception.CommandException) Dataverse(edu.harvard.iq.dataverse.Dataverse) XMLStreamException(javax.xml.stream.XMLStreamException) ConstraintViolation(javax.validation.ConstraintViolation) FileOutputStream(java.io.FileOutputStream) DataFile(edu.harvard.iq.dataverse.DataFile) File(java.io.File) Validator(javax.validation.Validator) DestroyDatasetCommand(edu.harvard.iq.dataverse.engine.command.impl.DestroyDatasetCommand) TransactionAttribute(javax.ejb.TransactionAttribute)

Aggregations

Gson (com.google.gson.Gson)6 DatasetDTO (edu.harvard.iq.dataverse.api.dto.DatasetDTO)6 XMLStreamException (javax.xml.stream.XMLStreamException)3 GsonBuilder (com.google.gson.GsonBuilder)2 Dataset (edu.harvard.iq.dataverse.Dataset)2 DatasetField (edu.harvard.iq.dataverse.DatasetField)2 DatasetFieldValue (edu.harvard.iq.dataverse.DatasetFieldValue)2 CommandException (edu.harvard.iq.dataverse.engine.command.exception.CommandException)2 CreateDatasetCommand (edu.harvard.iq.dataverse.engine.command.impl.CreateDatasetCommand)2 DestroyDatasetCommand (edu.harvard.iq.dataverse.engine.command.impl.DestroyDatasetCommand)2 JsonParseException (edu.harvard.iq.dataverse.util.json.JsonParseException)2 JsonParser (edu.harvard.iq.dataverse.util.json.JsonParser)2 StringReader (java.io.StringReader)2 JsonObject (javax.json.JsonObject)2 JsonReader (javax.json.JsonReader)2 ConstraintViolation (javax.validation.ConstraintViolation)2 Validator (javax.validation.Validator)2 ValidatorFactory (javax.validation.ValidatorFactory)2 DataFile (edu.harvard.iq.dataverse.DataFile)1 DatasetVersion (edu.harvard.iq.dataverse.DatasetVersion)1