Search in sources :

Example 1 with ExtensionMapping

use of org.gbif.ipt.model.ExtensionMapping in project ipt by gbif.

From the class ExtensionManagerImpl, method migrateResourceToNewExtensionVersion.

/**
 * Migrates a resource's extension mappings from the current version of an extension to a newer
 * version of that extension. Before migrating, it tallies (for logging) which properties were
 * deprecated, added, or had their vocabulary removed/unchanged/updated between the two versions.
 *
 * @param r       resource whose mappings must be migrated
 * @param current current version of the extension
 * @param newer   newer version of the extension to migrate mappings to
 *
 * @throws IllegalStateException if the two extensions do not share the same rowType, or if the
 *         resource has no mappings to the current extension
 */
protected void migrateResourceToNewExtensionVersion(Resource r, Extension current, Extension newer) {
    // sanity check that the current and newer extensions share the same rowType
    if (!current.getRowType().equalsIgnoreCase(newer.getRowType())) {
        throw new IllegalStateException("Current and newer extension must share the same rowType: " + current.getRowType() + " vs " + newer.getRowType());
    }
    // sanity check that the resource actually has mappings to the current extension
    if (r.getMappings(current.getRowType()).isEmpty()) {
        throw new IllegalStateException("Resource " + r.getShortname() + " has no mappings to extension " + current.getRowType());
    }
    LOG.info("Migrating " + r.getShortname() + " mappings to extension " + current.getRowType() + " to latest extension version");
    // populate various sets to keep track of how many terms were deprecated, how many terms' vocabulary was updated, etc.
    Set<ExtensionProperty> deprecated = new HashSet<>();
    Set<ExtensionProperty> vocabulariesRemoved = new HashSet<>();
    Set<ExtensionProperty> vocabulariesUnchanged = new HashSet<>();
    Set<ExtensionProperty> vocabulariesUpdated = new HashSet<>();
    for (ExtensionProperty property : current.getProperties()) {
        // does the newer extension still contain this property?
        if (!newer.hasProperty(property.qualifiedName())) {
            deprecated.add(property);
        } else if (property.getVocabulary() != null) {
            // property kept: check whether the newer extension uses a different version of its vocabulary
            Vocabulary v1 = property.getVocabulary();
            Vocabulary v2 = newer.getProperty(property.qualifiedName()).getVocabulary();
            if (v2 == null) {
                // case 1: vocabulary removed in newer version
                vocabulariesRemoved.add(property);
            } else if (v1.getUriString().equalsIgnoreCase(v2.getUriString())) {
                // case 2: vocabulary versions are unchanged between versions
                vocabulariesUnchanged.add(property);
            } else {
                // case 3: vocabulary has been updated in newer version
                // (plain else: the URIs differ, re-testing the negation would be redundant)
                vocabulariesUpdated.add(property);
            }
        }
    }
    LOG.debug(deprecated.size() + " properties have been deprecated in the newer version");
    LOG.debug(vocabulariesRemoved.size() + " properties in the newer version of extension no longer use a vocabulary");
    LOG.debug(vocabulariesUnchanged.size() + " properties in the newer version of extension use the same vocabulary");
    LOG.debug(vocabulariesUpdated.size() + " properties in the newer version of extension use a newer vocabulary");
    // set of new terms (terms present only in the newer version)
    Set<ExtensionProperty> added = new HashSet<>();
    for (ExtensionProperty property : newer.getProperties()) {
        // does the older extension contain this property?
        if (!current.hasProperty(property.qualifiedName())) {
            added.add(property);
        }
    }
    LOG.debug("Newer version of extension has " + added.size() + " new properties");
    // migrate every mapping to the current extension, dropping deprecated terms
    for (ExtensionMapping extensionMapping : r.getMappings(current.getRowType())) {
        migrateExtensionMapping(extensionMapping, newer, deprecated);
    }
}
Also used : ExtensionProperty(org.gbif.ipt.model.ExtensionProperty) Vocabulary(org.gbif.ipt.model.Vocabulary) ExtensionMapping(org.gbif.ipt.model.ExtensionMapping) HashSet(java.util.HashSet)

Example 2 with ExtensionMapping

use of org.gbif.ipt.model.ExtensionMapping in project ipt by gbif.

From the class ResourceManagerImpl, method createFromArchive.

/**
 * Creates a new resource by importing a Darwin Core Archive: reads the archive, determines the
 * resource's core type from the core rowType, imports the core and extension sources and their
 * mappings, reads the archive metadata (EML) if present, and persists the resulting resource.
 *
 * @param shortname shortname for the new resource (must not already exist)
 * @param dwca      Darwin Core Archive file to import
 * @param creator   user creating the resource
 * @param alog      action logger used to report problems back to the UI
 *
 * @return the newly created resource
 *
 * @throws AlreadyExistingException if a resource with this shortname already exists
 * @throws ImportException          if the archive is invalid or cannot be read
 * @throws InvalidFilenameException if a source file name is invalid
 */
private Resource createFromArchive(String shortname, File dwca, User creator, ActionLogger alog) throws AlreadyExistingException, ImportException, InvalidFilenameException {
    Objects.requireNonNull(shortname);
    // check if existing already
    if (get(shortname) != null) {
        throw new AlreadyExistingException();
    }
    Resource resource;
    try {
        // try to read dwca
        Archive arch = DwcFiles.fromLocation(dwca.toPath());
        // a valid archive must have a core file, and the core must declare a rowType
        if (arch.getCore() == null) {
            alog.error("manage.resource.create.core.invalid");
            throw new ImportException("Darwin Core Archive is invalid and does not have a core mapping");
        }
        if (arch.getCore().getRowType() == null) {
            alog.error("manage.resource.create.core.invalid.rowType");
            throw new ImportException("Darwin Core Archive is invalid, core mapping has no rowType");
        }
        // keep track of source files as a dwca might refer to the same source file multiple times
        Map<String, TextFileSource> sources = new HashMap<>();
        // determine core type for the resource based on the rowType
        Term coreRowType = arch.getCore().getRowType();
        CoreRowType resourceType;
        if (coreRowType.equals(DwcTerm.Taxon)) {
            resourceType = CoreRowType.CHECKLIST;
        } else if (coreRowType.equals(DwcTerm.Occurrence)) {
            resourceType = CoreRowType.OCCURRENCE;
        } else if (coreRowType.equals(DwcTerm.Event)) {
            resourceType = CoreRowType.SAMPLINGEVENT;
        } else {
            resourceType = CoreRowType.OTHER;
        }
        // create new resource
        resource = create(shortname, resourceType.toString().toUpperCase(Locale.ENGLISH), creator);
        // read core source+mappings
        TextFileSource s = importSource(resource, arch.getCore());
        sources.put(arch.getCore().getLocation(), s);
        ExtensionMapping map = importMappings(alog, arch.getCore(), s);
        resource.addMapping(map);
        // when extensions exist, the core must contain an id element that identifies each record,
        // so extension records can be linked back to core records
        if (!arch.getExtensions().isEmpty()) {
            if (map.getIdColumn() == null) {
                alog.error("manage.resource.create.core.invalid.id");
                throw new ImportException("Darwin Core Archive is invalid, core mapping has no id element");
            }
            // read extension sources+mappings, reusing a source already imported for another extension
            for (ArchiveFile ext : arch.getExtensions()) {
                if (sources.containsKey(ext.getLocation())) {
                    s = sources.get(ext.getLocation());
                    LOG.debug("SourceBase " + s.getName() + " shared by multiple extensions");
                } else {
                    s = importSource(resource, ext);
                    sources.put(ext.getLocation(), s);
                }
                map = importMappings(alog, ext, s);
                // each extension mapping must have a coreId element linking it back to the core
                if (map.getIdColumn() == null) {
                    alog.error("manage.resource.create.core.invalid.coreid");
                    throw new ImportException("Darwin Core Archive is invalid, extension mapping has no coreId element");
                }
                // ensure the extension contains a coreId term mapping with the correct coreId index
                if (resource.getCoreRowType() != null) {
                    updateExtensionCoreIdMapping(map, resource.getCoreRowType());
                }
                resource.addMapping(map);
            }
        }
        // try to read metadata
        Eml eml = readMetadata(resource.getShortname(), arch, alog);
        if (eml != null) {
            resource.setEml(eml);
        }
        // finally persist the whole thing
        save(resource);
        alog.info("manage.resource.create.success", new String[] { StringUtils.trimToEmpty(resource.getCoreRowType()), String.valueOf(resource.getSources().size()), String.valueOf(resource.getMappings().size()) });
    } catch (UnsupportedArchiveException | InvalidConfigException | IOException e) {
        // wrap any archive-reading or configuration problem as an ImportException, preserving the cause
        alog.warn(e.getMessage(), e);
        throw new ImportException(e);
    }
    return resource;
}
Also used : Archive(org.gbif.dwc.Archive) HashMap(java.util.HashMap) ArrayListValuedHashMap(org.apache.commons.collections4.multimap.ArrayListValuedHashMap) Eml(org.gbif.metadata.eml.Eml) AlreadyExistingException(org.gbif.ipt.service.AlreadyExistingException) Resource(org.gbif.ipt.model.Resource) TextFileSource(org.gbif.ipt.model.TextFileSource) Term(org.gbif.dwc.terms.Term) DwcTerm(org.gbif.dwc.terms.DwcTerm) InvalidConfigException(org.gbif.ipt.service.InvalidConfigException) IOException(java.io.IOException) UnsupportedArchiveException(org.gbif.dwc.UnsupportedArchiveException) ArchiveFile(org.gbif.dwc.ArchiveFile) ImportException(org.gbif.ipt.service.ImportException) ExtensionMapping(org.gbif.ipt.model.ExtensionMapping) CoreRowType(org.gbif.ipt.model.Resource.CoreRowType)

Example 3 with ExtensionMapping

use of org.gbif.ipt.model.ExtensionMapping in project ipt by gbif.

From the class ResourceManagerImpl, method loadFromDir.

/**
 * Reads a complete resource configuration (resource config & eml) from the resource config folder
 * and returns the Resource instance for the internal in-memory cache. The resource's shortname is
 * taken from the folder name. Also performs a series of on-load migrations/repairs for resources
 * persisted by older IPT versions (see inline comments).
 *
 * @param resourceDir resource configuration folder; if it does not exist, null is returned
 * @param creator     user to set as creator when the persisted resource has none (may be null)
 * @param alog        action logger used to report problems back to the UI
 *
 * @return the loaded resource, or null if resourceDir does not exist
 *
 * @throws InvalidConfigException if the configuration cannot be read, or references a
 *         non-existent or non-installed extension
 */
private Resource loadFromDir(File resourceDir, @Nullable User creator, ActionLogger alog) throws InvalidConfigException {
    if (resourceDir.exists()) {
        // load full configuration from resource.xml and eml.xml files
        String shortname = resourceDir.getName();
        try {
            File cfgFile = dataDir.resourceFile(shortname, PERSISTENCE_FILE);
            InputStream input = new FileInputStream(cfgFile);
            // deserialize the persisted resource configuration
            Resource resource = (Resource) xstream.fromXML(input);
            // populate missing creator - it cannot be null! (this fixes issue #1309)
            if (creator != null && resource.getCreator() == null) {
                resource.setCreator(creator);
                LOG.warn("On load, populated missing creator for resource: " + shortname);
            }
            // non existing users end up being a NULL in the set, so remove them
            // shouldnt really happen - but people can even manually cause a mess
            resource.getManagers().remove(null);
            // validate each mapping's extension, and reset auto-generated IDs on non-Taxon cores:
            // if a non-Taxon core extension is using auto-generated IDs, the coreID is set to No ID (-99)
            for (ExtensionMapping ext : resource.getMappings()) {
                Extension x = ext.getExtension();
                if (x == null) {
                    alog.warn("manage.resource.create.extension.null");
                    throw new InvalidConfigException(TYPE.INVALID_EXTENSION, "Resource references non-existent extension");
                } else if (extensionManager.get(x.getRowType()) == null) {
                    alog.warn("manage.resource.create.rowType.null", new String[] { x.getRowType() });
                    throw new InvalidConfigException(TYPE.INVALID_EXTENSION, "Resource references non-installed extension");
                }
                // is the ExtensionMapping of core type, not taxon core type, and uses a coreIdColumn mapping?
                if (ext.isCore() && !ext.isTaxonCore() && ext.getIdColumn() != null) {
                    if (ext.getIdColumn().equals(ExtensionMapping.IDGEN_LINE_NUMBER) || ext.getIdColumn().equals(ExtensionMapping.IDGEN_UUID)) {
                        ext.setIdColumn(ExtensionMapping.NO_ID);
                    }
                }
            }
            // shortname persists as folder name, so xstream doesnt handle this:
            resource.setShortname(shortname);
            // infer coreType if null
            if (resource.getCoreType() == null) {
                inferCoreType(resource);
            }
            // standardize subtype if not null
            if (resource.getSubtype() != null) {
                standardizeSubtype(resource);
            }
            // add proper source file pointer (xstream does not persist the File handle)
            for (Source src : resource.getSources()) {
                src.setResource(resource);
                if (src instanceof FileSource) {
                    FileSource frSrc = (FileSource) src;
                    frSrc.setFile(dataDir.sourceFile(resource, frSrc));
                }
            }
            // pre v2.2 resources: set IdentifierStatus if null
            if (resource.getIdentifierStatus() == null) {
                resource.setIdentifierStatus(IdentifierStatus.UNRESERVED);
            }
            // load eml (this must be done before trying to convert version below)
            loadEml(resource);
            // pre v2.2 resources: convert resource version from integer to major_version.minor_version style
            // also convert/rename eml, rtf, and dwca versioned files also
            BigDecimal converted = convertVersion(resource);
            if (converted != null) {
                updateResourceVersion(resource, resource.getEmlVersion(), converted);
            }
            // pre v2.2 resources: construct a VersionHistory for last published version (if appropriate)
            VersionHistory history = constructVersionHistoryForLastPublishedVersion(resource);
            if (history != null) {
                resource.addVersionHistory(history);
            }
            // pre v2.2.1 resources: rename dwca.zip to dwca-18.0.zip (where 18.0 is the last published version for example)
            if (resource.getLastPublishedVersionsVersion() != null) {
                renameDwcaToIncludeVersion(resource, resource.getLastPublishedVersionsVersion());
            }
            // update EML with latest resource basics (version and GUID)
            syncEmlWithResource(resource);
            LOG.debug("Read resource configuration for " + shortname);
            return resource;
        } catch (Exception e) {
            // any failure while reading/migrating is surfaced as an invalid resource configuration
            LOG.error("Cannot read resource configuration for " + shortname, e);
            throw new InvalidConfigException(TYPE.RESOURCE_CONFIG, "Cannot read resource configuration for " + shortname + ": " + e.getMessage());
        }
    }
    return null;
}
Also used : FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) TextFileSource(org.gbif.ipt.model.TextFileSource) ExcelFileSource(org.gbif.ipt.model.ExcelFileSource) FileSource(org.gbif.ipt.model.FileSource) Resource(org.gbif.ipt.model.Resource) InvalidConfigException(org.gbif.ipt.service.InvalidConfigException) VersionHistory(org.gbif.ipt.model.VersionHistory) FileInputStream(java.io.FileInputStream) TextFileSource(org.gbif.ipt.model.TextFileSource) ExcelFileSource(org.gbif.ipt.model.ExcelFileSource) SqlSource(org.gbif.ipt.model.SqlSource) FileSource(org.gbif.ipt.model.FileSource) Source(org.gbif.ipt.model.Source) UrlSource(org.gbif.ipt.model.UrlSource) BigDecimal(java.math.BigDecimal) InvalidFilenameException(org.gbif.ipt.service.InvalidFilenameException) DoiExistsException(org.gbif.doi.service.DoiExistsException) ImportException(org.gbif.ipt.service.ImportException) GeneratorException(org.gbif.ipt.task.GeneratorException) PublicationException(org.gbif.ipt.service.PublicationException) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) DoiException(org.gbif.doi.service.DoiException) RegistryException(org.gbif.ipt.service.RegistryException) UnsupportedArchiveException(org.gbif.dwc.UnsupportedArchiveException) InvalidMetadataException(org.gbif.doi.service.InvalidMetadataException) DeletionNotAllowedException(org.gbif.ipt.service.DeletionNotAllowedException) FileNotFoundException(java.io.FileNotFoundException) InvalidConfigException(org.gbif.ipt.service.InvalidConfigException) DocumentException(com.lowagie.text.DocumentException) AlreadyExistingException(org.gbif.ipt.service.AlreadyExistingException) Extension(org.gbif.ipt.model.Extension) ExtensionMapping(org.gbif.ipt.model.ExtensionMapping) ArchiveFile(org.gbif.dwc.ArchiveFile) File(java.io.File)

Example 4 with ExtensionMapping

use of org.gbif.ipt.model.ExtensionMapping in project ipt by gbif.

From the class GenerateDwca, method addFieldsToArchive.

/**
 * First we need to find the union of all terms mapped (in all files) for a single Extension. Then make each mapped
 * term a field in the final archive. Static/default mappings are not stored for a field, since they are not
 * expressed in meta.xml but instead get written to the data file.
 * </br>
 * Additionally, when the Extension has a datasetID concept term and the mapping is configured to use the
 * resource DOI as its value, a datasetID field and mapping are added.
 *
 * @param mappings list of ExtensionMapping, all to the same Extension
 * @param af ArchiveFile the fields are added to
 *
 * @return set of conceptTerms that have been mapped (in all files) for a single Extension
 *
 * @throws GeneratorException if source files mapped to the same extension use different
 *         multi-value field delimiters
 */
private Set<Term> addFieldsToArchive(List<ExtensionMapping> mappings, ArchiveFile af) throws GeneratorException {
    Set<Term> mappedConceptTerms = new HashSet<>();
    for (ExtensionMapping m : mappings) {
        // multi-value field delimiter, part of each source data configuration
        String delimitedBy = StringUtils.trimToNull(m.getSource().getMultiValueFieldsDelimitedBy());
        for (PropertyMapping pm : m.getFields()) {
            Term term = TERM_FACTORY.findTerm(pm.getTerm().qualifiedName());
            // ensure Extension has concept term
            if (term != null && m.getExtension().getProperty(term) != null) {
                if (af.hasTerm(term)) {
                    // term already added by a previous mapping: only validate delimiter consistency
                    ArchiveField field = af.getField(term);
                    mappedConceptTerms.add(term);
                    // multi-value delimiter must be same across all sources
                    if (field.getDelimitedBy() != null && !field.getDelimitedBy().equals(delimitedBy)) {
                        throw new GeneratorException("More than one type of multi-value field delimiter is being used in the source files mapped to the " + m.getExtension().getName() + " extension. Please either ensure all source files mapped to this extension use the same delimiter, otherwise just leave the delimiter blank.");
                    }
                } else if (pm.getIndex() == null || pm.getIndex() >= 0) {
                    // simplified from "(index != null && index >= 0) || index == null":
                    // only property mappings with an explicit negative index are skipped
                    log.debug("Handling property mapping for term: " + term.qualifiedName() + " (index " + pm.getIndex() + ")");
                    af.addField(buildField(term, delimitedBy));
                    mappedConceptTerms.add(term);
                }
            }
        }
        // if Extension has datasetID concept term, check if resource DOI should be used as value for mapping
        ExtensionProperty ep = m.getExtension().getProperty(DwcTerm.datasetID.qualifiedName());
        if (ep != null && m.isDoiUsedForDatasetId()) {
            log.debug("Detected that resource DOI to be used as value for datasetID mapping..");
            // include datasetID field in ArchiveFile
            ArchiveField f = buildField(DwcTerm.datasetID, null);
            af.addField(f);
            // include datasetID field mapping in ExtensionMapping
            PropertyMapping pm = new PropertyMapping(f);
            pm.setTerm(ep);
            m.getFields().add(pm);
            // include datasetID in set of all terms mapped for Extension
            mappedConceptTerms.add(DwcTerm.datasetID);
        }
    }
    return mappedConceptTerms;
}
Also used : ExtensionProperty(org.gbif.ipt.model.ExtensionProperty) ExtensionMapping(org.gbif.ipt.model.ExtensionMapping) PropertyMapping(org.gbif.ipt.model.PropertyMapping) DwcTerm(org.gbif.dwc.terms.DwcTerm) Term(org.gbif.dwc.terms.Term) ArchiveField(org.gbif.dwc.ArchiveField) HashSet(java.util.HashSet)

Example 5 with ExtensionMapping

use of org.gbif.ipt.model.ExtensionMapping in project ipt by gbif.

From the class GenerateDwca, method addDataFile.

/**
 * Adds a single data file for a list of extension mappings that must all be mapped to the same extension.
 * </br>
 * The ID column is always the 1st column (index 0) and is always equal to the core record identifier that has been
 * mapped (e.g. occurrenceID, taxonID, etc). The written file becomes the archive's core file when the
 * extension's rowType matches the resource's core rowType, otherwise an archive extension file.
 * </br>
 * NOTE(review): this method mutates instance state (currRecords, currRecordsSkipped, currExtension,
 * recordsByExtension, archive) — it is presumably not safe to call concurrently; confirm with callers.
 *
 * @param mappings list of ExtensionMapping, all mapped to the same extension
 * @param rowLimit maximum number of rows to write; null means no limit — TODO confirm in dumpData
 *
 * @throws IllegalArgumentException if not all mappings are mapped to the same extension
 * @throws InterruptedException if the thread was interrupted
 * @throws IOException if problems occurred while persisting new data files
 * @throws GeneratorException if any problem was encountered writing data file
 */
public void addDataFile(List<ExtensionMapping> mappings, @Nullable Integer rowLimit) throws IOException, IllegalArgumentException, InterruptedException, GeneratorException {
    checkForInterruption();
    // nothing to do without mappings
    if (mappings == null || mappings.isEmpty()) {
        return;
    }
    // update reporting
    currRecords = 0;
    currRecordsSkipped = 0;
    Extension ext = mappings.get(0).getExtension();
    currExtension = ext.getTitle();
    // verify that all mappings share this extension
    for (ExtensionMapping m : mappings) {
        if (!ext.equals(m.getExtension())) {
            throw new IllegalArgumentException("All mappings for a single data file need to be mapped to the same extension: " + ext.getRowType());
        }
    }
    // create new tab file with the help of the Archive class representing the core file or an extension
    ArchiveFile af = ArchiveFile.buildTabFile();
    af.setRowType(TERM_FACTORY.findTerm(ext.getRowType()));
    af.setEncoding(CHARACTER_ENCODING);
    af.setDateFormat("YYYY-MM-DD");
    // in the generated file column 0 will be the id column
    ArchiveField idField = new ArchiveField();
    idField.setIndex(ID_COLUMN_INDEX);
    af.setId(idField);
    // find the union of all terms mapped and make them a field in the final archive
    Set<Term> mappedConceptTerms = addFieldsToArchive(mappings, af);
    // retrieve the ordered list of mapped ExtensionProperty
    List<ExtensionProperty> propertyList = getOrderedMappedExtensionProperties(ext, mappedConceptTerms);
    // reassign indexes ordered by Extension
    assignIndexesOrderedByExtension(propertyList, af);
    // total column count is equal to id column + mapped columns
    int totalColumns = 1 + propertyList.size();
    // create file name from extension name, with incremental suffix to resolve name conflicts (e.g. taxon.txt,
    // taxon2.txt, taxon3.txt)
    String extensionName = (ext.getName() == null) ? "f" : ext.getName().toLowerCase().replaceAll("\\s", "_");
    String fn = createFileName(dwcaFolder, extensionName);
    // open new file writer for single data file
    File dataFile = new File(dwcaFolder, fn);
    // ready to go though each mapping and dump the data
    try (Writer writer = org.gbif.utils.file.FileUtils.startNewUtf8File(dataFile)) {
        af.addLocation(dataFile.getName());
        addMessage(Level.INFO, "Start writing data file for " + currExtension);
        boolean headerWritten = false;
        for (ExtensionMapping m : mappings) {
            // prepare index ordered list of all output columns apart from id column
            PropertyMapping[] inCols = new PropertyMapping[totalColumns];
            for (ArchiveField f : af.getFields().values()) {
                if (f.getIndex() != null && f.getIndex() > ID_COLUMN_INDEX) {
                    inCols[f.getIndex()] = m.getField(f.getTerm().qualifiedName());
                }
            }
            // write header line 1 time only to file
            if (!headerWritten) {
                writeHeaderLine(propertyList, totalColumns, af, writer);
                headerWritten = true;
            }
            // write data (records) to file
            dumpData(writer, inCols, m, totalColumns, rowLimit, resource.getDoi());
            // store record number by extension rowType
            recordsByExtension.put(ext.getRowType(), currRecords);
        }
    } catch (IOException e) {
        // some error writing this file, report
        log.error("Fatal DwC-A Generator Error encountered while writing header line to data file", e);
        // set last error report!
        setState(e);
        throw new GeneratorException("Error writing header line to data file", e);
    }
    // add archive file to archive: as core when its rowType matches the resource core rowType
    if (resource.getCoreRowType() != null && resource.getCoreRowType().equalsIgnoreCase(ext.getRowType())) {
        archive.setCore(af);
    } else {
        archive.addExtension(af);
    }
    // final reporting
    addMessage(Level.INFO, "Data file written for " + currExtension + " with " + currRecords + " records and " + totalColumns + " columns");
    // how many records were skipped?
    if (currRecordsSkipped > 0) {
        addMessage(Level.WARN, "!!! " + currRecordsSkipped + " records were skipped for " + currExtension + " due to errors interpreting line, or because the line was empty");
    }
}
Also used : DwcTerm(org.gbif.dwc.terms.DwcTerm) Term(org.gbif.dwc.terms.Term) IOException(java.io.IOException) ArchiveFile(org.gbif.dwc.ArchiveFile) Extension(org.gbif.ipt.model.Extension) ExtensionProperty(org.gbif.ipt.model.ExtensionProperty) ExtensionMapping(org.gbif.ipt.model.ExtensionMapping) PropertyMapping(org.gbif.ipt.model.PropertyMapping) ArchiveFile(org.gbif.dwc.ArchiveFile) File(java.io.File) ArchiveField(org.gbif.dwc.ArchiveField) MetaDescriptorWriter(org.gbif.dwc.MetaDescriptorWriter) PrintWriter(java.io.PrintWriter) Writer(java.io.Writer) StringWriter(java.io.StringWriter)

Aggregations

ExtensionMapping (org.gbif.ipt.model.ExtensionMapping)22 Extension (org.gbif.ipt.model.Extension)14 PropertyMapping (org.gbif.ipt.model.PropertyMapping)14 Resource (org.gbif.ipt.model.Resource)12 ExtensionProperty (org.gbif.ipt.model.ExtensionProperty)9 File (java.io.File)7 HashSet (java.util.HashSet)7 TextFileSource (org.gbif.ipt.model.TextFileSource)6 Test (org.junit.jupiter.api.Test)6 ArrayList (java.util.ArrayList)5 ArchiveFile (org.gbif.dwc.ArchiveFile)5 Term (org.gbif.dwc.terms.Term)5 IOException (java.io.IOException)4 DwcTerm (org.gbif.dwc.terms.DwcTerm)4 InvalidConfigException (org.gbif.ipt.service.InvalidConfigException)4 ResourceManager (org.gbif.ipt.service.manage.ResourceManager)4 ArchiveField (org.gbif.dwc.ArchiveField)3 BeforeEach (org.junit.jupiter.api.BeforeEach)3 HashMap (java.util.HashMap)2 TreeSet (java.util.TreeSet)2