Search in sources:

Example 1 with Schema

use of life.catalogue.csv.Schema in project backend by CatalogueOfLife.

The following snippet is taken from the class DwcaReader, method buildSchema.

/**
 * Reads one {@code <core>} or {@code <extension>} element of a DwC archive descriptor (meta.xml)
 * and registers the resulting {@link Schema} in {@link #schemas}.
 *
 * @param parser XML stream parser positioned at the start of the core/extension element
 * @param core   true if this element is the archive core; also sets {@link #coreRowType}
 * @throws XMLStreamException if the descriptor XML is invalid
 * @throws IOException if the data file cannot be opened for charset detection
 */
private void buildSchema(XMLStreamReader2 parser, boolean core) throws XMLStreamException, IOException {
    // rowType
    final Term rowType = VocabularyUtils.TF.findClassTerm(attr(parser, "rowType"));
    if (core) {
        coreRowType = rowType;
    }
    // declared encoding; validated at the very end with charset detection as fallback
    String enc = attr(parser, "encoding");
    // delimiter
    final CsvParserSettings set = CSV.clone();
    String val = unescapeBackslash(attr(parser, "fieldsTerminatedBy"));
    set.setDelimiterDetectionEnabled(true);
    if (val != null) {
        if (val.length() != 1) {
            throw new IllegalArgumentException("fieldsTerminatedBy needs to be a single char");
        } else {
            // an explicit delimiter disables auto detection
            set.setDelimiterDetectionEnabled(false);
            set.getFormat().setDelimiter(val.charAt(0));
            LOG.debug("Use delimiter {} for {}", StringEscapeUtils.escapeJava(val), rowType);
        }
    }
    val = unescapeBackslash(attr(parser, "fieldsEnclosedBy"));
    set.setQuoteDetectionEnabled(false);
    if (val == null) {
        // no quoting configured: use NUL as an effectively unused quote char
        val = String.valueOf('\0');
    }
    if (val.length() != 1) {
        throw new IllegalArgumentException("fieldsEnclosedBy needs to be a single char");
    } else {
        LOG.debug("Use quote char {} for {}", val, rowType);
        set.getFormat().setQuote(val.charAt(0));
    }
    // we ignore linesTerminatedBy:
    // it is quite often wrong and people don't really use anything other than \n or \r
    set.setLineSeparatorDetectionEnabled(true);
    val = attr(parser, "ignoreHeaderLines");
    if (val != null) {
        try {
            set.setNumberOfRowsToSkip(Long.parseLong(val));
        } catch (NumberFormatException e) {
            // keep the cause so the offending value can be traced
            throw new IllegalArgumentException("ignoreHeaderLines needs to be a valid integer", e);
        }
    }
    // parse fields & data file from the nested elements.
    // The file path comes from the <location> element only — there is no attribute for it,
    // so it must not be initialized from the (charset) "encoding" attribute.
    Path file = null;
    List<Schema.Field> fields = Lists.newArrayList();
    int event;
    boolean stop = false;
    StringBuilder text = new StringBuilder();
    while (!stop) {
        event = parser.next();
        stop = event == XMLStreamConstants.END_DOCUMENT;
        switch(event) {
            case XMLStreamConstants.START_ELEMENT:
                text = new StringBuilder();
                boolean id = false;
                switch(parser.getLocalName()) {
                    case "id":
                    case "coreId":
                    case "coreid":
                        id = true;
                        // deliberate fall through: id elements are fields too
                    case "field":
                        buildField(parser, id).ifPresent(fields::add);
                        break;
                }
                break;
            case XMLStreamConstants.END_ELEMENT:
                switch(parser.getLocalName()) {
                    case "location":
                        file = resolve(text.toString());
                        break;
                    case "core":
                    case "extension":
                        // end of the schema definition we are building
                        stop = true;
                        break;
                }
                break;
            case XMLStreamConstants.CHARACTERS:
                if (parser.hasText()) {
                    text.append(parser.getText().trim());
                }
                break;
        }
    }
    if (file == null) {
        throw new IllegalArgumentException("No data file location given for rowType " + rowType);
    }
    // final encoding: trust the declared charset if valid, otherwise detect it from the file
    Charset charset;
    try {
        // Charset.forName also throws IllegalArgumentException for a null name
        charset = Charset.forName(enc);
    } catch (IllegalArgumentException e) {
        try (CharsetDetectingStream in = CharsetDetectingStream.create(Files.newInputStream(file))) {
            charset = in.getCharset();
            LOG.debug("Use encoding {} for file {}", charset, PathUtils.getFilename(file));
        }
        LOG.warn("Bad charset encoding {} specified, using {}", enc, charset);
    }
    Schema s = new Schema(file, rowType, charset, set, fields);
    LOG.debug("Found schema {}", s);
    schemas.put(rowType, s);
}
Also used : Path(java.nio.file.Path) Schema(life.catalogue.csv.Schema) Charset(java.nio.charset.Charset) ColdpTerm(life.catalogue.coldp.ColdpTerm) DwcUnofficialTerm(life.catalogue.coldp.DwcUnofficialTerm) CsvParserSettings(com.univocity.parsers.csv.CsvParserSettings) CharsetDetectingStream(life.catalogue.common.io.CharsetDetectingStream)

Example 2 with Schema

use of life.catalogue.csv.Schema in project backend by CatalogueOfLife.

The following snippet is taken from the class DwcaReader, method validate.

@Override
protected void validate() throws NormalizationFailedException.SourceInvalidException {
    super.validate();
    // a DwC archive only qualifies as a checklist if its core is a Taxon file
    if (coreRowType != DwcTerm.Taxon) {
        throw new NormalizationFailedException.SourceInvalidException("No Taxon core, not a checklist?");
    }
    final Schema core = schema(DwcTerm.Taxon).get();
    // a minimal parsed name needs a genus (or genericName) plus a specificEpithet
    final boolean genusMapped = core.hasTerm(DwcTerm.genus) || core.hasTerm(DwcTerm.genericName);
    if (genusMapped && core.hasTerm(DwcTerm.specificEpithet)) {
        mappingFlags.setParsedNameMapped(true);
    }
    // either scientificName or the parsed name parts must be mapped
    if (!core.hasTerm(DwcTerm.scientificName)) {
        LOG.warn("No scientificName mapped");
        if (!mappingFlags.isParsedNameMapped()) {
            // nothing at all to build a name from
            throw new NormalizationFailedException.SourceInvalidException("No scientificName nor parsed name mapped");
        }
        // a parsed name without authorship is suspicious but tolerated
        if (!core.hasTerm(DwcTerm.scientificNameAuthorship)) {
            LOG.warn("No scientificNameAuthorship mapped for parsed name");
        }
    }
    // highly recommended term
    if (!core.hasTerm(DwcTerm.taxonRank)) {
        LOG.warn("No taxonRank mapped");
    }
    // prefer taxonID over the generic record ID when they point to different columns
    if (core.hasTerm(DwcTerm.taxonID)) {
        if (!core.hasTerm(DwcaTerm.ID) || !core.field(DwcaTerm.ID).index.equals(core.field(DwcTerm.taxonID).index)) {
            LOG.info("Use taxonID instead of ID");
            mappingFlags.setTaxonId(true);
        }
    }
    // register multi value delimiters, e.g. for acceptedID
    for (Schema.Field col : core.columns) {
        if (!Strings.isNullOrEmpty(col.delimiter)) {
            mappingFlags.getMultiValueDelimiters().put(col.term, Splitter.on(col.delimiter).omitEmptyStrings());
        }
    }
    // any denormalized classification terms present?
    if (DwcTerm.HIGHER_RANKS.stream().anyMatch(core::hasTerm) || core.hasTerm(AcefTerm.Superfamily)) {
        mappingFlags.setDenormedClassificationMapped(true);
    }
    if (core.hasTerm(DwcTerm.parentNameUsageID) || core.hasTerm(DwcTerm.parentNameUsage)) {
        mappingFlags.setParentNameMapped(true);
    }
    if (core.hasTerm(DwcTerm.acceptedNameUsageID) || core.hasTerm(DwcTerm.acceptedNameUsage)) {
        mappingFlags.setAcceptedNameMapped(true);
    } else if (core.hasTerm(AcefTerm.AcceptedTaxonID)) {
        // ACEF term sometimes gets confused with dwc - translate it into dwc here.
        // Schema and its fields are immutable, so a copy of the whole schema is built.
        Schema.Field acef = core.field(AcefTerm.AcceptedTaxonID);
        Schema.Field dwc = new Schema.Field(DwcTerm.acceptedNameUsageID, acef.value, acef.index, acef.delimiter);
        List<Schema.Field> cols = Lists.newArrayList(core.columns);
        cols.set(cols.indexOf(acef), dwc);
        putSchema(new Schema(core.file, core.rowType, core.encoding, core.settings, cols));
        mappingFlags.setAcceptedNameMapped(true);
    } else {
        LOG.warn("No accepted name terms mapped");
    }
    if (core.hasTerm(DwcTerm.originalNameUsageID) || core.hasTerm(DwcTerm.originalNameUsage)) {
        mappingFlags.setOriginalNameMapped(true);
    }
    // was any classification mapped at all?
    if (!mappingFlags.isParentNameMapped() && !mappingFlags.isDenormedClassificationMapped()) {
        LOG.warn("No higher classification mapped");
    }
    // drop a metadata file reference that points nowhere
    if (metadataFile != null && !Files.exists(metadataFile)) {
        LOG.warn("Metadata file {} does not exist", metadataFile);
        metadataFile = null;
    }
// TODO: validate extensions:
// vernacular name: vernacularName
// distribution: some area (locationID, countryCode, etc)
}
Also used : Schema(life.catalogue.csv.Schema) ColdpTerm(life.catalogue.coldp.ColdpTerm) DwcUnofficialTerm(life.catalogue.coldp.DwcUnofficialTerm)

Example 3 with Schema

use of life.catalogue.csv.Schema in project backend by CatalogueOfLife.

The following snippet is taken from the class DwcaReader, method discoverSchemas.

/**
 * First tries to find and read a meta.xml archive descriptor.
 * If none is found all potential txt files are scanned instead.
 *
 * @param termPrefix optional preferred term namespace prefix to use when looking up class & property terms
 * @throws IOException if reading the archive files fails
 */
@Override
protected void discoverSchemas(String termPrefix) throws IOException {
    Path eml = resolve(EML_FN);
    if (Files.exists(eml)) {
        metadataFile = eml;
    }
    Path meta = resolve(META_FN);
    if (Files.exists(meta)) {
        // an archive descriptor defines all schemas explicitly
        readFromMeta(meta);
    } else {
        super.discoverSchemas(termPrefix);
        // add artificial id terms for known rowType id pairs
        for (Schema s : schemas.values()) {
            if (!s.hasTerm(DwcaTerm.ID)) {
                // Map.get already yields null for unknown row types;
                // wrapping getOrDefault(..., null) in an Optional was redundant
                Term idTerm = ROW_TYPE_TO_ID.get(s.rowType);
                if (idTerm != null && s.hasTerm(idTerm)) {
                    // create another id field with the same column index
                    Schema.Field id = new Schema.Field(DwcaTerm.ID, s.field(idTerm).index);
                    List<Schema.Field> columns = Lists.newArrayList(s.columns);
                    columns.add(id);
                    Schema s2 = new Schema(s.file, s.rowType, s.encoding, s.settings, columns);
                    // NOTE(review): replaces an entry for an existing key while iterating values();
                    // safe as long as updateSchema never adds a new key — TODO confirm
                    updateSchema(s2);
                }
            }
        }
        // select core
        if (size() == 1) {
            coreRowType = schemas.keySet().iterator().next();
        } else {
            for (Term t : PREFERRED_CORE_TYPES) {
                if (hasData(t)) {
                    coreRowType = t;
                    LOG.warn("{} data files found but no archive descriptor. Using {}", size(), coreRowType);
                    break;
                }
            }
            if (coreRowType == null) {
                // rather abort instead of picking randomly
                throw new NormalizationFailedException.SourceInvalidException("Multiple unknown schemas found: " + Joiner.on(", ").join(schemas.keySet()));
            }
        }
    }
    CsvFormat format = coreSchema().settings.getFormat();
    LOG.info("Found {} core [delim={} quote={}] and {} extensions", coreRowType, format.getDelimiter(), format.getQuote(), size() - 1);
}
Also used : Path(java.nio.file.Path) Schema(life.catalogue.csv.Schema) CsvFormat(com.univocity.parsers.csv.CsvFormat) ColdpTerm(life.catalogue.coldp.ColdpTerm) DwcUnofficialTerm(life.catalogue.coldp.DwcUnofficialTerm)

Example 4 with Schema

use of life.catalogue.csv.Schema in project backend by CatalogueOfLife.

The following snippet is taken from the class ColdpReader, method validate.

protected void validate() throws NormalizationFailedException.SourceInvalidException {
    super.validate();
    // only ColDP row types are allowed; everything else is filtered out
    filterSchemas(rowType -> rowType instanceof ColdpTerm);
    // fail early: entity files missing their required ID columns are ignored altogether
    for (ColdpTerm t : ID_SCHEMAS) {
        require(t, ColdpTerm.ID);
    }
    for (ColdpTerm t : NAMEID_SCHEMAS) {
        require(t, ColdpTerm.nameID);
    }
    final Term nameRowType = requireOneSchema(ColdpTerm.Name, ColdpTerm.NameUsage);
    require(ColdpTerm.NameRelation, ColdpTerm.relatedNameID);
    require(ColdpTerm.NameRelation, ColdpTerm.type);
    require(ColdpTerm.TaxonConceptRelation, ColdpTerm.relatedTaxonID);
    require(ColdpTerm.TaxonConceptRelation, ColdpTerm.type);
    require(ColdpTerm.SpeciesInteraction, ColdpTerm.type);
    // without a scientificName we need at least some parsed name fields
    if (!hasData(nameRowType, ColdpTerm.scientificName)) {
        LOG.warn("No scientificName mapped! Require parsed name fields");
        // a uninomial or a genus (genericName for NameUsage) must exist otherwise
        ColdpTerm genusTerm = nameRowType.equals(ColdpTerm.NameUsage) ? ColdpTerm.genericName : ColdpTerm.genus;
        requireOne(nameRowType, ColdpTerm.uninomial, genusTerm);
    }
    // without any references no referenceID columns are allowed
    if (!hasReferences()) {
        LOG.warn("No Reference mapped! Disallow all referenceIDs");
        disallow(ColdpTerm.NameUsage, ColdpTerm.nameReferenceID);
        for (ColdpTerm rt : REFID_SCHEMAS) {
            disallow(rt, ColdpTerm.referenceID);
        }
    }
    Optional<Schema> usageOpt = schema(ColdpTerm.Taxon).or(() -> schema(ColdpTerm.NameUsage));
    if (!usageOpt.isPresent()) {
        LOG.warn("No Taxa mapped, only inserting names!");
        for (ColdpTerm t : TAXID_SCHEMAS) {
            schemas.remove(t);
        }
    } else {
        Schema usage = usageOpt.get();
        final boolean hasParent = usage.hasTerm(ColdpTerm.parentID);
        mappingFlags.setParentNameMapped(hasParent);
        if (!hasParent) {
            LOG.warn("No taxon parentID mapped");
        }
        final boolean denormed = usage.hasAnyTerm(ColdpTerm.DENORMALIZED_RANKS);
        mappingFlags.setDenormedClassificationMapped(denormed);
        if (denormed) {
            LOG.info("Use denormalized taxon classification");
        }
        for (ColdpTerm t : TAXID_SCHEMAS) {
            require(t, ColdpTerm.taxonID);
        }
        requireOne(ColdpTerm.Distribution, ColdpTerm.area, ColdpTerm.areaID);
        require(ColdpTerm.VernacularName, ColdpTerm.name);
        require(ColdpTerm.Media, ColdpTerm.url);
        if (hasSchema(ColdpTerm.Synonym)) {
            mappingFlags.setAcceptedNameMapped(true);
        } else {
            LOG.warn("No Synonyms mapped!");
        }
    }
    reportMissingSchemas(ColdpTerm.class);
}
Also used : Schema(life.catalogue.csv.Schema) ColdpTerm(life.catalogue.coldp.ColdpTerm) ColdpTerm(life.catalogue.coldp.ColdpTerm) Term(org.gbif.dwc.terms.Term)

Aggregations

ColdpTerm (life.catalogue.coldp.ColdpTerm)4 Schema (life.catalogue.csv.Schema)4 DwcUnofficialTerm (life.catalogue.coldp.DwcUnofficialTerm)3 Path (java.nio.file.Path)2 CsvFormat (com.univocity.parsers.csv.CsvFormat)1 CsvParserSettings (com.univocity.parsers.csv.CsvParserSettings)1 Charset (java.nio.charset.Charset)1 CharsetDetectingStream (life.catalogue.common.io.CharsetDetectingStream)1 Term (org.gbif.dwc.terms.Term)1