use of edu.harvard.iq.dataverse.DatasetField in project dataverse by IQSS.
the class JsonParser method parseMetadataBlocks.
/**
 * Parses every metadata block of {@code json} into one flat list of dataset fields.
 *
 * <p>Each entry of {@code json} is read as a block object whose {@code "fields"} array is
 * handed, element by element, to {@code parseField}. When {@code parseField} fails with a
 * {@link CompoundVocabularyException}, the parser is in lenient mode, and the failing field is
 * the geographic coverage field, the field is recovered through
 * {@code remapGeographicCoverage}; in every other case the original exception is re-thrown.
 * Once all blocks are parsed, {@link #convertKeywordsToSubjects(List)} is applied to the result.
 *
 * @param json object keyed by metadata block name
 * @return the parsed fields of all blocks, in iteration order
 * @throws JsonParseException if a field cannot be parsed and cannot be recovered
 */
public List<DatasetField> parseMetadataBlocks(JsonObject json) throws JsonParseException {
    Set<String> keys = json.keySet();
    List<DatasetField> fields = new LinkedList<>();
    for (String blockName : keys) {
        JsonObject blockJson = json.getJsonObject(blockName);
        JsonArray fieldsJson = blockJson.getJsonArray("fields");
        for (JsonObject fieldJson : fieldsJson.getValuesAs(JsonObject.class)) {
            try {
                fields.add(parseField(fieldJson));
            } catch (CompoundVocabularyException ex) {
                if (!lenient) {
                    // if not lenient mode, re-throw exception
                    throw ex;
                }
                DatasetFieldType fieldType = datasetFieldSvc.findByNameOpt(fieldJson.getString("typeName", ""));
                // The lookup may come back empty (e.g. missing or unknown "typeName"); in that
                // case surface the original parse failure instead of a NullPointerException.
                if (fieldType == null || !DatasetFieldConstant.geographicCoverage.equals(fieldType.getName())) {
                    throw ex;
                }
                fields.add(remapGeographicCoverage(ex));
            }
        }
    }
    convertKeywordsToSubjects(fields);
    return fields;
}
use of edu.harvard.iq.dataverse.DatasetField in project dataverse by IQSS.
the class JsonParser method convertKeywordsToSubjects.
/**
 * Special processing of keywords and subjects. All keywords and subjects will be input
 * from foreign formats (DDI, dcterms, etc) as keywords.
 * As part of the parsing, we will move keywords that match subject controlled vocabulary values
 * into the subjects datasetField.
 *
 * <p>Only the first field whose type is named {@code "keyword"} is examined. If none of its
 * values matches a subject, {@code fields} is left untouched. Otherwise the keyword field keeps
 * only the non-matching compound values and a new subject field is appended to {@code fields};
 * the matched vocabulary values are given consecutive display orders starting at 0, in the
 * order in which they were first encountered.
 *
 * @param fields - the parsed datasetFields; modified in place
 */
public void convertKeywordsToSubjects(List<DatasetField> fields) {
    DatasetField keywordField = null;
    for (DatasetField field : fields) {
        if (field.getDatasetFieldType().getName().equals("keyword")) {
            keywordField = field;
            break;
        }
    }
    if (keywordField == null) {
        // nothing to do.
        return;
    }

    DatasetFieldType type = datasetFieldSvc.findByNameOpt(DatasetFieldConstant.subject);
    // new list to hold subjects that we find
    List<ControlledVocabularyValue> subjects = new ArrayList<>();
    // Make new list to hold the non-subject keywords
    List<DatasetFieldCompoundValue> filteredValues = new ArrayList<>();
    for (DatasetFieldCompoundValue compoundVal : keywordField.getDatasetFieldCompoundValues()) {
        // Loop through the child fields to find the "keywordValue" field
        for (DatasetField childField : compoundVal.getChildDatasetFields()) {
            if (!childField.getDatasetFieldType().getName().equals(DatasetFieldConstant.keywordValue)) {
                continue;
            }
            // check if this value is a subject
            ControlledVocabularyValue cvv = datasetFieldSvc.findControlledVocabularyValueByDatasetFieldTypeAndStrValue(type, childField.getValue(), lenient);
            if (cvv == null) {
                // the keyword was not found in the subject list, so retain it in filtered list
                filteredValues.add(compoundVal);
            } else if (!subjects.contains(cvv)) {
                // save the value for our subject field (each subject only once)
                subjects.add(cvv);
            }
        }
    }

    // if we have found any subjects in the keyword list, then update the keyword and subject fields appropriately.
    if (!subjects.isEmpty()) {
        keywordField.setDatasetFieldCompoundValues(filteredValues);
        DatasetField subjectField = new DatasetField();
        subjectField.setDatasetFieldType(type);
        // The counter must live outside the loop; declaring it inside reset it on every
        // iteration, so every subject ended up with display order 0.
        int order = 0;
        for (ControlledVocabularyValue val : subjects) {
            val.setDisplayOrder(order);
            val.setDatasetFieldType(type);
            order++;
        }
        subjectField.setControlledVocabularyValues(subjects);
        fields.add(subjectField);
    }
}
use of edu.harvard.iq.dataverse.DatasetField in project dataverse by IQSS.
the class IngestServiceBean method processDatasetMetadata.
/**
 * Merges the metadata an ingest plugin extracted from a file into the dataset fields of
 * {@code editVersion}.
 *
 * <p>Only the metadata block whose name equals
 * {@code fileMetadataIngest.getMetadataBlockName()} is processed, and only if it is among the
 * blocks of the dataset's owner. The version's fields are re-initialised through
 * {@code initDatasetFields()} first. Top-level primitive field types are merged value by value
 * (the three {@code resolution.*} fields are aggregated into a single min/max range instead);
 * every non-primitive field type gets at most one new compound value built from the extracted
 * child values, which is attached only when no equal compound value exists yet.
 *
 * @param fileMetadataIngest extracted values, keyed by dataset field type name
 * @param editVersion the dataset version whose fields are updated in place
 * @throws IOException declared for callers; nothing in this method throws it directly
 */
private void processDatasetMetadata(FileMetadataIngest fileMetadataIngest, DatasetVersion editVersion) throws IOException {
    for (MetadataBlock mdb : editVersion.getDataset().getOwner().getMetadataBlocks()) {
        if (!mdb.getName().equals(fileMetadataIngest.getMetadataBlockName())) {
            continue;
        }
        logger.fine("Ingest Service: dataset version has " + mdb.getName() + " metadata block enabled.");
        editVersion.setDatasetFields(editVersion.initDatasetFields());
        Map<String, Set<String>> fileMetadataMap = fileMetadataIngest.getMetadataMap();
        for (DatasetFieldType dsft : mdb.getDatasetFieldTypes()) {
            if (!dsft.isPrimitive()) {
                // A compound field:
                mergeExtractedCompoundValue(dsft, fileMetadataMap, editVersion);
            } else if (!dsft.isHasParent()) {
                // Child primitives are handled as part of their compound parent.
                mergeExtractedPrimitiveValues(dsft, fileMetadataMap, editVersion);
            }
        }
    }
}

/** Merges the values extracted for one top-level primitive field type into every matching field of the version. */
private void mergeExtractedPrimitiveValues(DatasetFieldType dsft, Map<String, Set<String>> fileMetadataMap, DatasetVersion editVersion) {
    String dsfName = dsft.getName();
    // See if the plugin has found anything for this field:
    Set<String> mValues = fileMetadataMap.get(dsfName);
    if (mValues == null || mValues.isEmpty()) {
        return;
    }
    logger.fine("Ingest Service: found extracted metadata for field " + dsfName);
    boolean isResolutionField = dsfName.equals("resolution.Temporal")
            || dsfName.equals("resolution.Spatial")
            || dsfName.equals("resolution.Spectral");
    // go through the existing fields:
    for (DatasetField dsf : editVersion.getFlatDatasetFields()) {
        if (!dsf.getDatasetFieldType().equals(dsft)) {
            continue;
        }
        if (isResolutionField) {
            aggregateExtractedResolutionRange(dsf, mValues);
            continue;
        }
        for (String fValue : mValues) {
            if (dsft.isControlledVocabulary()) {
                addExtractedControlledVocabularyValue(dsf, dsft, dsfName, fValue);
            } else {
                addExtractedPrimitiveValue(dsf, dsfName, fValue);
            }
        }
    }
}

/**
 * Folds the numeric extracted values and the currently stored value of a resolution field into
 * a single "min - max" string (or just "min" when both ends are equal) and stores it as the
 * field's first value if it differs from what is stored now.
 */
private void aggregateExtractedResolutionRange(DatasetField dsf, Set<String> mValues) {
    // Select the min. and max. of the extracted values; they are expected to parse as doubles.
    Double minValue = null;
    Double maxValue = null;
    for (String fValue : mValues) {
        try {
            double thisValue = Double.parseDouble(fValue);
            if (minValue == null || Double.compare(thisValue, minValue) < 0) {
                minValue = thisValue;
            }
            if (maxValue == null || Double.compare(thisValue, maxValue) > 0) {
                maxValue = thisValue;
            }
        } catch (NumberFormatException ignored) {
            // A non-numeric value cannot contribute to a numeric range; skip it.
            logger.fine("Ingest Service: skipping non-numeric resolution value " + fValue);
        }
    }
    if (minValue == null || maxValue == null) {
        return;
    }

    // Widen the range with whatever is already stored in the first value, if anything.
    String storedValue = "";
    List<DatasetFieldValue> currentValues = dsf.getDatasetFieldValues();
    boolean hasFirstValue = currentValues != null && !currentValues.isEmpty() && currentValues.get(0) != null;
    if (hasFirstValue) {
        storedValue = currentValues.get(0).getValue();
        if (storedValue != null && !storedValue.equals("")) {
            try {
                double storedMinValue;
                double storedMaxValue;
                int separatorAt = storedValue.indexOf(" - ");
                if (separatorAt > -1) {
                    storedMinValue = Double.parseDouble(storedValue.substring(0, separatorAt));
                    storedMaxValue = Double.parseDouble(storedValue.substring(separatorAt + 3));
                } else {
                    storedMinValue = Double.parseDouble(storedValue);
                    storedMaxValue = storedMinValue;
                }
                if (Double.compare(storedMinValue, minValue) < 0) {
                    minValue = storedMinValue;
                }
                if (Double.compare(storedMaxValue, maxValue) > 0) {
                    maxValue = storedMaxValue;
                }
            } catch (NumberFormatException ignored) {
                // The stored value is not a number or a range; it is simply replaced below.
                logger.fine("Ingest Service: stored resolution value is not numeric: " + storedValue);
            }
        } else {
            storedValue = "";
        }
    }

    String newAggregateValue = minValue.equals(maxValue)
            ? minValue.toString()
            : minValue.toString() + " - " + maxValue.toString();

    // finally, compare it to the value we have now:
    if (storedValue.equals(newAggregateValue)) {
        return;
    }
    if (dsf.getDatasetFieldValues() == null) {
        dsf.setDatasetFieldValues(new ArrayList<DatasetFieldValue>());
    }
    List<DatasetFieldValue> targetValues = dsf.getDatasetFieldValues();
    // Make sure slot 0 holds a value object before writing to it; get(0) on an empty list
    // would throw rather than return null.
    if (targetValues.isEmpty()) {
        targetValues.add(new DatasetFieldValue(dsf));
    } else if (targetValues.get(0) == null) {
        targetValues.set(0, new DatasetFieldValue(dsf));
    }
    targetValues.get(0).setValue(newAggregateValue);
}

/** Adds {@code fValue} as a new plain value of {@code dsf} unless an equal value is already present. */
private void addExtractedPrimitiveValue(DatasetField dsf, String dsfName, String fValue) {
    if (dsf.getDatasetFieldValues() == null) {
        dsf.setDatasetFieldValues(new ArrayList<DatasetFieldValue>());
    }
    for (DatasetFieldValue dsfv : dsf.getDatasetFieldValues()) {
        if (dsfv != null && fValue.equals(dsfv.getValue())) {
            logger.fine("Value " + fValue + " already exists for field " + dsfName);
            return;
        }
    }
    logger.fine("Creating a new value for field " + dsfName + ": " + fValue);
    DatasetFieldValue newDsfv = new DatasetFieldValue(dsf);
    newDsfv.setValue(fValue);
    dsf.getDatasetFieldValues().add(newDsfv);
}

/**
 * Adds the controlled vocabulary entry whose string value equals {@code fValue} to {@code dsf},
 * provided the entry is defined for {@code dsft} and not present on the field yet.
 */
private void addExtractedControlledVocabularyValue(DatasetField dsf, DatasetFieldType dsft, String dsfName, String fValue) {
    // first, let's see if it's a legit control vocab. entry:
    ControlledVocabularyValue legitControlledVocabularyValue = null;
    Collection<ControlledVocabularyValue> definedVocabularyValues = dsft.getControlledVocabularyValues();
    if (definedVocabularyValues != null) {
        for (ControlledVocabularyValue definedVocabValue : definedVocabularyValues) {
            if (fValue.equals(definedVocabValue.getStrValue())) {
                logger.fine("Yes, " + fValue + " is a valid controlled vocabulary value for the field " + dsfName);
                legitControlledVocabularyValue = definedVocabValue;
                break;
            }
        }
    }
    if (legitControlledVocabularyValue == null) {
        return;
    }

    // Only need to add the value if it is new, i.e. if it does not exist yet:
    if (dsf.getControlledVocabularyValues() == null) {
        dsf.setControlledVocabularyValues(new ArrayList<ControlledVocabularyValue>());
    }
    for (ControlledVocabularyValue cvv : dsf.getControlledVocabularyValues()) {
        if (fValue.equals(cvv.getStrValue())) {
            logger.fine("Controlled vocab. value " + fValue + " already exists for field " + dsfName);
            return;
        }
    }
    logger.fine("Adding controlled vocabulary value " + fValue + " to field " + dsfName);
    dsf.getControlledVocabularyValues().add(legitControlledVocabularyValue);
}

/**
 * Builds one compound value from the extracted values of the primitive, non-controlled-vocabulary
 * children of {@code dsft} and attaches it to every matching field of the version that does not
 * already hold an equal compound value.
 */
private void mergeExtractedCompoundValue(DatasetFieldType dsft, Map<String, Set<String>> fileMetadataMap, DatasetVersion editVersion) {
    DatasetFieldCompoundValue compoundDsfv = new DatasetFieldCompoundValue();
    int nonEmptyFields = 0;
    for (DatasetFieldType cdsft : dsft.getChildDatasetFieldTypes()) {
        String dsfName = cdsft.getName();
        Set<String> extractedValues = fileMetadataMap.get(dsfName);
        if (extractedValues == null || extractedValues.isEmpty()) {
            continue;
        }
        logger.fine("Ingest Service: found extracted metadata for field " + dsfName + ", part of the compound field " + dsft.getName());
        // TODO: can we have controlled vocabulary sub-fields inside compound fields?
        if (cdsft.isPrimitive() && !cdsft.isControlledVocabulary()) {
            DatasetField childDsf = new DatasetField();
            childDsf.setDatasetFieldType(cdsft);
            DatasetFieldValue newDsfv = new DatasetFieldValue(childDsf);
            // Only the first extracted value is used for a child field.
            newDsfv.setValue(extractedValues.iterator().next());
            childDsf.getDatasetFieldValues().add(newDsfv);
            childDsf.setParentDatasetFieldCompoundValue(compoundDsfv);
            compoundDsfv.getChildDatasetFields().add(childDsf);
            nonEmptyFields++;
        }
    }
    if (nonEmptyFields == 0) {
        return;
    }

    // Find the actual parent field(s) for this compound value:
    for (DatasetField dsf : editVersion.getFlatDatasetFields()) {
        if (!dsf.getDatasetFieldType().equals(dsft)) {
            continue;
        }
        // We are only interested in aggregating unique values, so skip the field if it
        // already carries a compound value matching the extracted one.
        boolean alreadyExists = false;
        for (DatasetFieldCompoundValue dsfcv : dsf.getDatasetFieldCompoundValues()) {
            if (compoundValueMatchesExtracted(dsfcv, fileMetadataMap, nonEmptyFields)) {
                alreadyExists = true;
                break;
            }
        }
        if (!alreadyExists) {
            // save this compound value, by attaching it to the version for proper cascading:
            compoundDsfv.setParentDatasetField(dsf);
            dsf.getDatasetFieldCompoundValues().add(compoundDsfv);
        }
    }
}

/**
 * Tells whether an existing compound value equals the extracted one: the number of its
 * non-blank children whose first value equals the first extracted value of the same name must
 * be exactly {@code nonEmptyFields}. Children without a stored value, and children for which
 * nothing was extracted, never count as a match.
 */
private boolean compoundValueMatchesExtracted(DatasetFieldCompoundValue dsfcv, Map<String, Set<String>> fileMetadataMap, int nonEmptyFields) {
    int matches = 0;
    for (DatasetField cdsf : dsfcv.getChildDatasetFields()) {
        List<DatasetFieldValue> childValues = cdsf.getDatasetFieldValues();
        if (childValues == null || childValues.isEmpty() || childValues.get(0) == null) {
            continue;
        }
        String cdsfValue = childValues.get(0).getValue();
        if (cdsfValue == null || cdsfValue.equals("")) {
            continue;
        }
        String cdsfName = cdsf.getDatasetFieldType().getName();
        Set<String> extractedValues = fileMetadataMap.get(cdsfName);
        String extractedValue = (extractedValues == null || extractedValues.isEmpty())
                ? null
                : extractedValues.iterator().next();
        logger.fine("values: existing: " + cdsfValue + ", extracted: " + extractedValue);
        if (cdsfValue.equals(extractedValue)) {
            matches++;
        }
    }
    return matches == nonEmptyFields;
}
use of edu.harvard.iq.dataverse.DatasetField in project dataverse by IQSS.
the class UpdateDatasetVersionCommand method execute.
/**
 * Copies the fields of {@code newVersion} onto the dataset's edit version, validates and
 * persists that version, and re-indexes the dataset.
 *
 * <p>The command is rejected when the dataset has no latest version or when the latest
 * version is not a draft. After validation succeeds, every field for which
 * {@code removeBlankDatasetFieldValues()} returns {@code true} is dropped, and the same
 * timestamp is recorded on the version and on the dataset before the merge.
 *
 * @param ctxt command context supplying permissions, the entity manager and the indexer
 * @return the merged (managed) dataset version
 * @throws CommandException if the preconditions above fail or validation reports violations
 */
@Override
public DatasetVersion execute(CommandContext ctxt) throws CommandException {
    final Dataset dataset = newVersion.getDataset();
    ctxt.permissions().checkEditDatasetLock(dataset, getRequest(), this);

    final DatasetVersion latestVersion = dataset.getLatestVersion();
    if (latestVersion == null) {
        throw new IllegalCommandException("Dataset " + dataset.getId() + " does not have a latest version.", this);
    }
    if (!latestVersion.isDraft()) {
        throw new IllegalCommandException("Cannot update a dataset version that's not a draft", this);
    }

    final DatasetVersion draft = dataset.getEditVersion();
    draft.setDatasetFields(newVersion.getDatasetFields());
    draft.setDatasetFields(draft.initDatasetFields());

    final Set<ConstraintViolation> violations = draft.validate();
    if (!violations.isEmpty()) {
        // Collect every violation message into one space-separated error string.
        final StringBuilder report = new StringBuilder("Validation failed:");
        for (ConstraintViolation violation : violations) {
            report.append(" ").append(violation.getMessage());
        }
        throw new IllegalCommandException(report.toString(), this);
    }

    // Drop the fields that report themselves as blank.
    for (Iterator<DatasetField> fieldIt = draft.getDatasetFields().iterator(); fieldIt.hasNext();) {
        final DatasetField field = fieldIt.next();
        if (field.removeBlankDatasetFieldValues()) {
            fieldIt.remove();
        }
    }

    final Timestamp updateTime = new Timestamp(System.currentTimeMillis());
    draft.setLastUpdateTime(updateTime);
    dataset.setModificationTime(updateTime);

    final DatasetVersion managedVersion = ctxt.em().merge(draft);
    final boolean doNormalSolrDocCleanUp = true;
    ctxt.index().indexDataset(managedVersion.getDataset(), doNormalSolrDocCleanUp);
    return managedVersion;
}
use of edu.harvard.iq.dataverse.DatasetField in project dataverse by IQSS.
the class ImportServiceBean method doImportHarvestedDataset.
/**
 * Imports one harvested metadata record as a dataset owned by the harvesting client's dataverse.
 *
 * <p>The record is read from {@code metadataFile}. DDI and Dublin Core records are converted to
 * a {@code DatasetDTO} and then to JSON; {@code dataverse_json} records are used as they are.
 * The JSON is parsed leniently into a dataset, missing required values and invalid values are
 * replaced with {@code DatasetField.NA_VALUE} (invalid ones are reported to {@code cleanupLog}),
 * and the dataset is created through a {@code CreateDatasetCommand}. If a harvested dataset with
 * the same global id already exists in the same dataverse with exactly one version, it is
 * destroyed and replaced. When parsing or creation fails, the JSON is saved next to the
 * metadata file with a {@code .json} suffix before the failure is reported.
 *
 * @param dataverseRequest request on whose behalf the commands are submitted
 * @param harvestingClient client the record was harvested with; must have a dataverse
 * @param harvestIdentifier identifier of the record on the remote OAI server
 * @param metadataFormat one of {@code ddi}/{@code oai_ddi*}, {@code dc}/{@code oai_dc},
 *        {@code dataverse_json}
 * @param metadataFile file holding the harvested record
 * @param cleanupLog receives one line for every value that had to be replaced
 * @return the newly created dataset
 * @throws ImportException if the client is unusable, the format is unsupported (including
 *         {@code null}), or the record cannot be converted, parsed or imported
 * @throws IOException if the metadata file or the saved JSON file cannot be read or written
 */
@TransactionAttribute(TransactionAttributeType.REQUIRES_NEW)
public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, HarvestingClient harvestingClient, String harvestIdentifier, String metadataFormat, File metadataFile, PrintWriter cleanupLog) throws ImportException, IOException {
    if (harvestingClient == null || harvestingClient.getDataverse() == null) {
        throw new ImportException("importHarvestedDataset called wiht a null harvestingClient, or an invalid harvestingClient.");
    }
    Dataverse owner = harvestingClient.getDataverse();
    Dataset importedDataset = null;
    DatasetDTO dsDTO = null;
    String json = null;

    // Harvested records are decoded (and the debug copy encoded) as UTF-8 explicitly rather
    // than with whatever the platform default charset happens to be.
    final java.nio.charset.Charset utf8 = java.nio.charset.StandardCharsets.UTF_8;

    // A null format must fall through to the "unsupported format" error below instead of
    // failing with a NullPointerException on toLowerCase().
    boolean isDdi = "ddi".equalsIgnoreCase(metadataFormat)
            || "oai_ddi".equals(metadataFormat)
            || (metadataFormat != null && metadataFormat.toLowerCase().matches("^oai_ddi.*"));

    if (isDdi) {
        try {
            String xmlToParse = new String(Files.readAllBytes(metadataFile.toPath()), utf8);
            // TODO:
            // import type should be configurable - it should be possible to
            // select whether you want to harvest with or without files,
            // ImportType.HARVEST vs. ImportType.HARVEST_WITH_FILES
            logger.fine("importing DDI " + metadataFile.getAbsolutePath());
            dsDTO = importDDIService.doImport(ImportType.HARVEST_WITH_FILES, xmlToParse);
        } catch (IOException | XMLStreamException | ImportException e) {
            // Keep the original exception as the cause so the stack trace is not lost.
            throw new ImportException("Failed to process DDI XML record: " + e.getClass() + " (" + e.getMessage() + ")", e);
        }
    } else if ("dc".equalsIgnoreCase(metadataFormat) || "oai_dc".equals(metadataFormat)) {
        logger.fine("importing DC " + metadataFile.getAbsolutePath());
        try {
            String xmlToParse = new String(Files.readAllBytes(metadataFile.toPath()), utf8);
            dsDTO = importGenericService.processOAIDCxml(xmlToParse);
        } catch (IOException | XMLStreamException e) {
            throw new ImportException("Failed to process Dublin Core XML record: " + e.getClass() + " (" + e.getMessage() + ")", e);
        }
    } else if ("dataverse_json".equals(metadataFormat)) {
        // This is Dataverse metadata already formatted in JSON.
        // Simply read it into a string, and pass to the final import further down:
        logger.fine("Attempting to import custom dataverse metadata from file " + metadataFile.getAbsolutePath());
        json = new String(Files.readAllBytes(metadataFile.toPath()), utf8);
    } else {
        throw new ImportException("Unsupported import metadata format: " + metadataFormat);
    }

    if (json == null) {
        if (dsDTO == null) {
            throw new ImportException("Failed to transform XML metadata format " + metadataFormat + " into a DatasetDTO");
        }
        // convert DTO to Json,
        Gson gson = new GsonBuilder().setPrettyPrinting().create();
        json = gson.toJson(dsDTO);
        logger.fine("JSON produced for the metadata harvested: " + json);
    }

    JsonObject obj;
    try (JsonReader jsonReader = Json.createReader(new StringReader(json))) {
        obj = jsonReader.readObject();
    }

    // and call parse Json to read it into a dataset
    try {
        JsonParser parser = new JsonParser(datasetfieldService, metadataBlockService, settingsService);
        parser.setLenient(true);
        Dataset ds = parser.parseDataset(obj);
        // For ImportType.NEW, if the metadata contains a global identifier, and it's not a protocol
        // we support, it should be rejected.
        // (TODO: ! - add some way of keeping track of supported protocols!)
        ds.setOwner(owner);
        ds.getLatestVersion().setDatasetFields(ds.getLatestVersion().initDatasetFields());

        // Check data against required constraints
        List<ConstraintViolation<DatasetField>> violations = ds.getVersions().get(0).validateRequired();
        // For migration and harvest, add NA for missing required values
        for (ConstraintViolation<DatasetField> v : violations) {
            DatasetField f = v.getRootBean();
            f.setSingleValue(DatasetField.NA_VALUE);
        }

        // Check data against validation constraints.
        // If "scrub migration data" is enabled we attempt to fix invalid data first; any value
        // that is still invalid afterwards is logged and replaced with NA.
        Set<ConstraintViolation> invalidViolations = ds.getVersions().get(0).validate();
        if (!invalidViolations.isEmpty()) {
            ValidatorFactory factory = Validation.buildDefaultValidatorFactory();
            Validator validator = factory.getValidator();
            for (ConstraintViolation<DatasetFieldValue> v : invalidViolations) {
                DatasetFieldValue f = v.getRootBean();
                boolean fixed = false;
                // TODO: Is this scrubbing something we want to continue doing?
                if (settingsService.isTrueForKey(SettingsServiceBean.Key.ScrubMigrationData, false)) {
                    fixed = processMigrationValidationError(f, cleanupLog, metadataFile.getName());
                    if (fixed) {
                        // Re-validate the scrubbed value; a scrub that leaves it invalid does not count.
                        Set<ConstraintViolation<DatasetFieldValue>> scrubbedViolations = validator.validate(f);
                        if (!scrubbedViolations.isEmpty()) {
                            fixed = false;
                        }
                    }
                }
                if (!fixed) {
                    String msg = "Data modified - File: " + metadataFile.getName() + "; Field: " + f.getDatasetField().getDatasetFieldType().getDisplayName() + "; " + "Invalid value: '" + f.getValue() + "'" + " Converted Value:'" + DatasetField.NA_VALUE + "'";
                    cleanupLog.println(msg);
                    f.setValue(DatasetField.NA_VALUE);
                }
            }
        }

        // A record without a global id cannot be imported:
        if (StringUtils.isEmpty(ds.getGlobalId())) {
            throw new ImportException("The harvested metadata record with the OAI server identifier " + harvestIdentifier + " does not contain a global unique identifier that we could recognize, skipping.");
        }
        ds.setHarvestedFrom(harvestingClient);
        ds.setHarvestIdentifier(harvestIdentifier);

        Dataset existingDs = datasetService.findByGlobalId(ds.getGlobalId());
        if (existingDs != null) {
            // An existing dataset owned by a different dataverse is skipped:
            if (existingDs.getOwner() != null && !owner.getId().equals(existingDs.getOwner().getId())) {
                throw new ImportException("The dataset with the global id " + ds.getGlobalId() + " already exists, in the dataverse " + existingDs.getOwner().getAlias() + ", skipping.");
            }
            // An existing dataset that is not harvested is skipped as well:
            if (!existingDs.isHarvested()) {
                throw new ImportException("A LOCAL dataset with the global id " + ds.getGlobalId() + " already exists in this dataverse; skipping.");
            }
            // We will replace the current version with the imported version.
            if (existingDs.getVersions().size() != 1) {
                throw new ImportException("Error importing Harvested Dataset, existing dataset has " + existingDs.getVersions().size() + " versions");
            }
            // Purge all the SOLR documents associated with this client from the
            // index server:
            indexService.deleteHarvestedDocuments(existingDs);
            // Remove the files of the existing dataset directly through the entity manager:
            for (DataFile harvestedFile : existingDs.getFiles()) {
                DataFile mergedFile = em.merge(harvestedFile);
                em.remove(mergedFile);
            }
            // TODO:
            // Verify what happens with the indexed files in SOLR?
            // are they going to be overwritten by the reindexing of the dataset?
            existingDs.setFiles(null);
            Dataset merged = em.merge(existingDs);
            engineSvc.submit(new DestroyDatasetCommand(merged, dataverseRequest));
        }
        importedDataset = engineSvc.submit(new CreateDatasetCommand(ds, dataverseRequest, false, ImportType.HARVEST));
    } catch (JsonParseException | ImportException | CommandException ex) {
        logger.fine("Failed to import harvested dataset: " + ex.getClass() + ": " + ex.getMessage());
        // Keep the JSON we produced next to the metadata file for debugging; the stream is
        // closed even if the write fails.
        try (FileOutputStream savedJsonFileStream = new FileOutputStream(new File(metadataFile.getAbsolutePath() + ".json"))) {
            savedJsonFileStream.write(json.getBytes(utf8));
            savedJsonFileStream.flush();
        }
        logger.info("JSON produced saved in " + metadataFile.getAbsolutePath() + ".json");
        throw new ImportException("Failed to import harvested dataset: " + ex.getClass() + " (" + ex.getMessage() + ")", ex);
    }
    return importedDataset;
}
Aggregations