Usage of life.catalogue.csv.Schema in the CatalogueOfLife backend project — class DwcaReader, method buildSchema.
/**
 * Parses a single {@code <core>} or {@code <extension>} element of a meta.xml archive
 * descriptor and registers the resulting Schema (data file, row type, character encoding,
 * CSV parser settings and column fields) in the schemas map.
 *
 * @param parser XML pull parser positioned at the opening core/extension element
 * @param core   if true the parsed rowType is remembered as the archive's coreRowType
 * @throws XMLStreamException on malformed XML
 * @throws IOException if the referenced data file cannot be opened for charset detection
 * @throws IllegalArgumentException on invalid descriptor attributes or a missing file location
 */
private void buildSchema(XMLStreamReader2 parser, boolean core) throws XMLStreamException, IOException {
  // rowType, e.g. dwc:Taxon, identifies the record type of this data file
  final Term rowType = VocabularyUtils.TF.findClassTerm(attr(parser, "rowType"));
  if (core) {
    coreRowType = rowType;
  }
  // declared character encoding of the data file, validated at the end of this method
  final String enc = attr(parser, "encoding");

  // CSV parser settings, starting from the shared defaults
  final CsvParserSettings set = CSV.clone();
  // field delimiter: autodetect unless explicitly declared
  String val = unescapeBackslash(attr(parser, "fieldsTerminatedBy"));
  set.setDelimiterDetectionEnabled(true);
  if (val != null) {
    if (val.length() != 1) {
      throw new IllegalArgumentException("fieldsTerminatedBy needs to be a single char");
    }
    set.setDelimiterDetectionEnabled(false);
    set.getFormat().setDelimiter(val.charAt(0));
    LOG.debug("Use delimiter {} for {}", StringEscapeUtils.escapeJava(val), rowType);
  }

  // quote character: never autodetected; NUL stands in for "no quoting"
  // as it cannot occur in valid text data
  val = unescapeBackslash(attr(parser, "fieldsEnclosedBy"));
  set.setQuoteDetectionEnabled(false);
  if (val == null) {
    val = String.valueOf('\0');
  }
  if (val.length() != 1) {
    throw new IllegalArgumentException("fieldsEnclosedBy needs to be a single char");
  }
  LOG.debug("Use quote char {} for {}", val, rowType);
  set.getFormat().setQuote(val.charAt(0));

  // we deliberately ignore linesTerminatedBy:
  // it is quite often wrong and nothing but \n or \r\n is used in practice
  set.setLineSeparatorDetectionEnabled(true);

  // number of header rows to skip before the data starts
  val = attr(parser, "ignoreHeaderLines");
  if (val != null) {
    try {
      set.setNumberOfRowsToSkip(Long.parseLong(val));
    } catch (NumberFormatException e) {
      throw new IllegalArgumentException("ignoreHeaderLines needs to be a valid integer");
    }
  }

  // parse fields & file location from the nested elements.
  // The data file is given by the <files><location> element handled below;
  // previously the "encoding" attribute was mistakenly resolved as the file path here.
  Path file = null;
  List<Schema.Field> fields = Lists.newArrayList();
  int event;
  boolean stop = false;
  StringBuilder text = new StringBuilder();
  while (!stop) {
    event = parser.next();
    stop = event == XMLStreamConstants.END_DOCUMENT;
    switch (event) {
      case XMLStreamConstants.START_ELEMENT:
        text = new StringBuilder();
        boolean id = false;
        switch (parser.getLocalName()) {
          case "id":
          case "coreId":
          case "coreid":
            id = true;
            // deliberate fall through: id elements are treated as fields too
          case "field":
            buildField(parser, id).ifPresent(fields::add);
            break;
        }
        break;
      case XMLStreamConstants.END_ELEMENT:
        switch (parser.getLocalName()) {
          case "location":
            file = resolve(text.toString());
            break;
          case "core":
          case "extension":
            // end of this schema definition
            stop = true;
            break;
        }
        break;
      case XMLStreamConstants.CHARACTERS:
        if (parser.hasText()) {
          text.append(parser.getText().trim());
        }
        break;
    }
  }
  if (file == null) {
    throw new IllegalArgumentException("No file location given for rowType " + rowType);
  }

  // final encoding: trust the declared charset if valid, otherwise detect it from the file.
  // Charset.forName also throws IllegalArgumentException for a null name, so a missing
  // encoding attribute falls through to detection as well.
  Charset charset;
  try {
    charset = Charset.forName(enc);
  } catch (IllegalArgumentException e) {
    try (CharsetDetectingStream in = CharsetDetectingStream.create(Files.newInputStream(file))) {
      charset = in.getCharset();
      LOG.debug("Use encoding {} for file {}", charset, PathUtils.getFilename(file));
    }
    LOG.warn("Bad charset encoding {} specified, using {}", enc, charset);
  }

  Schema s = new Schema(file, rowType, charset, set, fields);
  LOG.debug("Found schema {}", s);
  schemas.put(rowType, s);
}
Usage of life.catalogue.csv.Schema in the CatalogueOfLife backend project — class DwcaReader, method validate.
/**
 * Validates the discovered DwC archive after schema discovery.
 * Requires a Taxon core and either a mapped scientificName or parsable name parts,
 * and records mapping capabilities (parsed names, classification, accepted/parent/original
 * name links, multi-value delimiters) in mappingFlags for later processing.
 *
 * @throws NormalizationFailedException.SourceInvalidException if there is no Taxon core
 *         or neither scientificName nor parsed name fields are mapped
 */
@Override
protected void validate() throws NormalizationFailedException.SourceInvalidException {
super.validate();
// a DwC checklist archive must use dwc:Taxon as its core row type
if (coreRowType != DwcTerm.Taxon) {
throw new NormalizationFailedException.SourceInvalidException("No Taxon core, not a checklist?");
}
// check for a minimal parsed name: (genus or genericName) plus specificEpithet
final Schema core = schema(DwcTerm.Taxon).get();
if ((core.hasTerm(DwcTerm.genus) || core.hasTerm(DwcTerm.genericName)) && core.hasTerm(DwcTerm.specificEpithet)) {
mappingFlags.setParsedNameMapped(true);
}
// make sure either scientificName or genus & specificEpithet are mapped
if (!core.hasTerm(DwcTerm.scientificName)) {
LOG.warn("No scientificName mapped");
if (!mappingFlags.isParsedNameMapped()) {
// no name to work with!!!
throw new NormalizationFailedException.SourceInvalidException("No scientificName nor parsed name mapped");
} else {
// warn if there is no author mapped for a parsed name
if (!core.hasTerm(DwcTerm.scientificNameAuthorship)) {
LOG.warn("No scientificNameAuthorship mapped for parsed name");
}
}
}
// warn if highly recommended terms are missing
if (!core.hasTerm(DwcTerm.taxonRank)) {
LOG.warn("No taxonRank mapped");
}
// check if taxonID should be used, not the generic ID.
// The hasTerm short-circuit keeps field(DwcaTerm.ID) from being called when unmapped.
// NOTE(review): assumes field(...).index is non-null for the ID term when mapped — verify
if (core.hasTerm(DwcTerm.taxonID) && (!core.hasTerm(DwcaTerm.ID) || !core.field(DwcaTerm.ID).index.equals(core.field(DwcTerm.taxonID).index))) {
LOG.info("Use taxonID instead of ID");
mappingFlags.setTaxonId(true);
}
// multi values in use, e.g. for acceptedID? Register one splitter per delimited field.
for (Schema.Field f : core.columns) {
if (!Strings.isNullOrEmpty(f.delimiter)) {
mappingFlags.getMultiValueDelimiters().put(f.term, Splitter.on(f.delimiter).omitEmptyStrings());
}
}
// any denormalized higher rank column marks the classification as denormed
for (Term t : DwcTerm.HIGHER_RANKS) {
if (core.hasTerm(t)) {
mappingFlags.setDenormedClassificationMapped(true);
break;
}
}
// ACEF Superfamily also counts as a denormalized classification column
if (core.hasTerm(AcefTerm.Superfamily)) {
mappingFlags.setDenormedClassificationMapped(true);
}
if (core.hasTerm(DwcTerm.parentNameUsageID) || core.hasTerm(DwcTerm.parentNameUsage)) {
mappingFlags.setParentNameMapped(true);
}
if (core.hasTerm(DwcTerm.acceptedNameUsageID) || core.hasTerm(DwcTerm.acceptedNameUsage)) {
mappingFlags.setAcceptedNameMapped(true);
} else {
if (core.hasTerm(AcefTerm.AcceptedTaxonID)) {
// this sometimes gets confused with dwc - translate into dwc as we read dwc archives here
// as schema and all fields are final we create a copyTaxon of the entire schema here
Schema.Field f = core.field(AcefTerm.AcceptedTaxonID);
Schema.Field f2 = new Schema.Field(DwcTerm.acceptedNameUsageID, f.value, f.index, f.delimiter);
List<Schema.Field> updatedColumns = Lists.newArrayList(core.columns);
updatedColumns.set(updatedColumns.indexOf(f), f2);
Schema s2 = new Schema(core.file, core.rowType, core.encoding, core.settings, updatedColumns);
putSchema(s2);
mappingFlags.setAcceptedNameMapped(true);
} else {
LOG.warn("No accepted name terms mapped");
}
}
if (core.hasTerm(DwcTerm.originalNameUsageID) || core.hasTerm(DwcTerm.originalNameUsage)) {
mappingFlags.setOriginalNameMapped(true);
}
// any classification?
if (!mappingFlags.isParentNameMapped() && !mappingFlags.isDenormedClassificationMapped()) {
LOG.warn("No higher classification mapped");
}
// metadata file present? Reset the reference if the file vanished from disk.
if (metadataFile != null && !Files.exists(metadataFile)) {
LOG.warn("Metadata file {} does not exist", metadataFile);
metadataFile = null;
}
// TODO: validate extensions:
// vernacular name: vernacularName
// distribution: some area (locationID, countryCode, etc)
}
Usage of life.catalogue.csv.Schema in the CatalogueOfLife backend project — class DwcaReader, method discoverSchemas.
/**
 * First tries to find and read a meta.xml archive descriptor.
 * If none is found all potential txt files are scanned directly, artificial id columns
 * are added for well-known rowType/id-term pairs, and a core schema is selected.
 *
 * @param termPrefix optional preferred term namespace prefix to use when looking up class & property terms
 * @throws IOException if archive files cannot be read
 * @throws NormalizationFailedException.SourceInvalidException if no core schema can be determined
 */
@Override
protected void discoverSchemas(String termPrefix) throws IOException {
  // remember an EML metadata document if present
  Path eml = resolve(EML_FN);
  if (Files.exists(eml)) {
    metadataFile = eml;
  }
  Path meta = resolve(META_FN);
  if (Files.exists(meta)) {
    // an archive descriptor defines all schemas, including the core, explicitly
    readFromMeta(meta);
  } else {
    super.discoverSchemas(termPrefix);
    // add artificial id terms for known rowType/id-term pairs
    for (Schema s : schemas.values()) {
      if (!s.hasTerm(DwcaTerm.ID)) {
        // getOrDefault(key, null) was a redundant spelling of get(key)
        Optional<Term> idTerm = Optional.ofNullable(ROW_TYPE_TO_ID.get(s.rowType));
        if (idTerm.isPresent() && s.hasTerm(idTerm.get())) {
          // create another id field sharing the column index of the natural id term
          Schema.Field id = new Schema.Field(DwcaTerm.ID, s.field(idTerm.get()).index);
          List<Schema.Field> columns = Lists.newArrayList(s.columns);
          columns.add(id);
          Schema s2 = new Schema(s.file, s.rowType, s.encoding, s.settings, columns);
          // NOTE(review): assumes updateSchema only replaces the value for an existing
          // rowType key; otherwise this would mutate schemas while iterating — verify
          updateSchema(s2);
        }
      }
    }
    // select the core schema
    if (size() == 1) {
      coreRowType = schemas.keySet().iterator().next();
    } else {
      // with several files, pick the first preferred row type that actually has data
      for (Term t : PREFERRED_CORE_TYPES) {
        if (hasData(t)) {
          coreRowType = t;
          LOG.warn("{} data files found but no archive descriptor. Using {}", size(), coreRowType);
          break;
        }
      }
      if (coreRowType == null) {
        // rather abort instead of picking randomly
        throw new NormalizationFailedException.SourceInvalidException("Multiple unknown schemas found: " + Joiner.on(", ").join(schemas.keySet()));
      }
    }
  }
  CsvFormat format = coreSchema().settings.getFormat();
  LOG.info("Found {} core [delim={} quote={}] and {} extensions", coreRowType, format.getDelimiter(), format.getQuote(), size() - 1);
}
Usage of life.catalogue.csv.Schema in the CatalogueOfLife backend project — class ColdpReader, method validate.
/**
 * Validates the discovered ColDP schemas: keeps only COL row types, enforces the
 * required id/nameID/taxonID terms per schema, requires either a scientificName or
 * parsed name fields, disallows reference links when no references exist, and records
 * classification and synonymy capabilities in mappingFlags.
 *
 * @throws NormalizationFailedException.SourceInvalidException if a required schema or term is missing
 */
protected void validate() throws NormalizationFailedException.SourceInvalidException {
super.validate();
// allow only COL row types
filterSchemas(rowType -> rowType instanceof ColdpTerm);
// Fail early; if a required term is missing the whole file is ignored
for (ColdpTerm t : ID_SCHEMAS) {
require(t, ColdpTerm.ID);
}
for (ColdpTerm t : NAMEID_SCHEMAS) {
require(t, ColdpTerm.nameID);
}
// exactly one of Name or NameUsage must be present; remember which one
Term nameRowType = requireOneSchema(ColdpTerm.Name, ColdpTerm.NameUsage);
// relation schemas need their target id and a relation type
require(ColdpTerm.NameRelation, ColdpTerm.relatedNameID);
require(ColdpTerm.NameRelation, ColdpTerm.type);
require(ColdpTerm.TaxonConceptRelation, ColdpTerm.relatedTaxonID);
require(ColdpTerm.TaxonConceptRelation, ColdpTerm.type);
require(ColdpTerm.SpeciesInteraction, ColdpTerm.type);
// either require the scientificName or at least some parsed field
if (!hasData(nameRowType, ColdpTerm.scientificName)) {
LOG.warn("No scientificName mapped! Require parsed name fields");
// genus & specificEpithet must exist otherwise!
// NameUsage uses genericName where Name uses genus
if (nameRowType.equals(ColdpTerm.NameUsage)) {
requireOne(nameRowType, ColdpTerm.uninomial, ColdpTerm.genericName);
} else {
requireOne(nameRowType, ColdpTerm.uninomial, ColdpTerm.genus);
}
}
// reference dependencies: without a Reference schema no referenceID columns make sense
if (!hasReferences()) {
LOG.warn("No Reference mapped! Disallow all referenceIDs");
disallow(ColdpTerm.NameUsage, ColdpTerm.nameReferenceID);
for (ColdpTerm rt : REFID_SCHEMAS) {
disallow(rt, ColdpTerm.referenceID);
}
}
// taxa can come from a Taxon schema or be embedded in NameUsage
Optional<Schema> taxonOpt = schema(ColdpTerm.Taxon).or(() -> schema(ColdpTerm.NameUsage));
if (taxonOpt.isPresent()) {
Schema taxon = taxonOpt.get();
if (taxon.hasTerm(ColdpTerm.parentID)) {
mappingFlags.setParentNameMapped(true);
} else {
mappingFlags.setParentNameMapped(false);
LOG.warn("No taxon parentID mapped");
}
if (taxon.hasAnyTerm(ColdpTerm.DENORMALIZED_RANKS)) {
mappingFlags.setDenormedClassificationMapped(true);
LOG.info("Use denormalized taxon classification");
} else {
mappingFlags.setDenormedClassificationMapped(false);
}
// taxon-linked schemas must reference a taxonID
for (ColdpTerm t : TAXID_SCHEMAS) {
require(t, ColdpTerm.taxonID);
}
requireOne(ColdpTerm.Distribution, ColdpTerm.area, ColdpTerm.areaID);
require(ColdpTerm.VernacularName, ColdpTerm.name);
require(ColdpTerm.Media, ColdpTerm.url);
if (hasSchema(ColdpTerm.Synonym)) {
mappingFlags.setAcceptedNameMapped(true);
} else {
LOG.warn("No Synonyms mapped!");
}
} else {
// names-only dataset: drop all taxon-dependent schemas
LOG.warn("No Taxa mapped, only inserting names!");
for (ColdpTerm t : TAXID_SCHEMAS) {
schemas.remove(t);
}
}
reportMissingSchemas(ColdpTerm.class);
}
Aggregations