Search in sources :

Example 16 with LabeledCSVParser

use of com.Ostermiller.util.LabeledCSVParser in project eol-globi-data by jhpoelen.

the class StudyImporterForGoMexSI2 method addSpecimen.

/**
 * Creates a UTF-8 labeled CSV parser for {@code datafile} via the parser factory and
 * passes it, with the label and listener, on to {@code parseSpecimen}.
 *
 * @param datafile            resource to create the parser for; also used in the error message
 * @param scientificNameLabel forwarded unchanged to {@code parseSpecimen}
 * @param specimenListener    forwarded unchanged to {@code parseSpecimen}
 * @throws StudyImporterException wrapping any {@link IOException} raised while creating
 *                                the parser or while parsing
 */
private void addSpecimen(String datafile, String scientificNameLabel, ParseEventHandler specimenListener) throws StudyImporterException {
    try {
        // The parser is created first (argument evaluation) and then consumed by parseSpecimen.
        parseSpecimen(
                datafile,
                scientificNameLabel,
                specimenListener,
                parserFactory.createParser(datafile, CharsetConstant.UTF8));
    } catch (IOException ioFailure) {
        final String message = "failed to open resource [" + datafile + "]";
        throw new StudyImporterException(message, ioFailure);
    }
}
Also used : LabeledCSVParser(com.Ostermiller.util.LabeledCSVParser) IOException(java.io.IOException)

Example 17 with LabeledCSVParser

use of com.Ostermiller.util.LabeledCSVParser in project eol-globi-data by jhpoelen.

the class ParserFactoryForDatasetTest method parserWithDatasetContextLocalResource.

/**
 * Checks that a parser obtained from a {@code ParserFactoryForDataset} backed by a
 * {@code DatasetLocal} returns the expected values from its first {@code getLine()} call
 * on the classpath test resource.
 */
@Test
public void parserWithDatasetContextLocalResource() throws IOException {
    final String[] expectedValues = { "valueA", "valueB" };
    final LabeledCSVParser csvParser = new ParserFactoryForDataset(new DatasetLocal())
            .createParser("classpath:/org/eol/globi/data/someResource.csv", "UTF-8");
    assertThat(csvParser.getLine(), is(expectedValues));
}
Also used : LabeledCSVParser(com.Ostermiller.util.LabeledCSVParser) DatasetLocal(org.eol.globi.service.DatasetLocal) Test(org.junit.Test)

Example 18 with LabeledCSVParser

use of com.Ostermiller.util.LabeledCSVParser in project eol-globi-data by jhpoelen.

the class StudyImporterForBioInfoTest method importTaxa.

/**
 * Builds a taxon map from a small inline CSV fixture (header plus nine rows) and checks
 * that id "268" maps to a taxon named "Chenopodiaceae" while id "41886" is absent.
 */
@Test
public void importTaxa() throws IOException {
    // One CSV row per source line; the concatenated value is the complete fixture.
    final String taxaCsv =
            "my taxon id,rank,latin,authority,english,NBN Code,family,order,phylum,url\n"
            + "\"268\",\"Informal\",\"'Chenopodiaceae'\",\"\",\"the old Chenopodiaceae\",\"\",\"Amaranthaceae\",\"Caryophyllales\",\"Tracheophyta\",\"www.bioinfo.org.uk/html/t268.htm\"\n"
            + "\"162827\",\"Species\",\"Abacarus hystrix\",\"(Nalepa, 1896)\",\"a mite\",\"NHMSYS0020190380\",\"Eriophyidae\",\"Trombidiformes\",\"Arthropoda\",\"www.bioinfo.org.uk/html/t162827.htm\"\n"
            + "\"41886\",\"Genus\",\"Abdera\",\"Stephens, 1832\",\"a genus of false darkling beetles\",\"NHMSYS0020151134\",\"Melandryidae\",\"Coleoptera\",\"Arthropoda\",\"www.bioinfo.org.uk/html/t41886.htm\"\n"
            + "\"34737\",\"Species\",\"Abdera biflexuosa\",\"(Curtis, 1829)\",\"a false darkling beetle\",\"NBNSYS0000024889\",\"Melandryidae\",\"Coleoptera\",\"Arthropoda\",\"www.bioinfo.org.uk/html/t34737.htm\"\n"
            + "\"34738\",\"Species\",\"Abdera flexuosa\",\"(Paykull, 1799)\",\"a false darkling beetle\",\"NBNSYS0000024890\",\"Melandryidae\",\"Coleoptera\",\"Arthropoda\",\"www.bioinfo.org.uk/html/t34738.htm\"\n"
            + "\"34739\",\"Species\",\"Abdera quadrifasciata\",\"(Curtis, 1829)\",\"a false darkling beetle\",\"NBNSYS0000024891\",\"Melandryidae\",\"Coleoptera\",\"Arthropoda\",\"www.bioinfo.org.uk/html/t34739.htm\"\n"
            + "\"34740\",\"Species\",\"Abdera triguttata\",\"(Gyllenhal, 1810)\",\"a false darkling beetle\",\"NBNSYS0000024892\",\"Melandryidae\",\"Coleoptera\",\"Arthropoda\",\"www.bioinfo.org.uk/html/t34740.htm\"\n"
            + "\"102829\",\"Species\",\"Abia sericea\",\"(Linnaeus, 1767)\",\"a clubhorned sawfly\",\"NHMSYS0020480647\",\"Cimbicidae\",\"Hymenoptera\",\"Arthropoda\",\"www.bioinfo.org.uk/html/t102829.htm\"\n"
            + "\"43913\",\"Genus\",\"Abies\",\"Mill.\",\"firs\",\"NHMSYS0000455511\",\"Pinaceae\",\"Pinales\",\"Tracheophyta\",\"www.bioinfo.org.uk/html/t43913.htm\"\n";

    final Map<String, Taxon> taxaById = StudyImporterForBioInfo.buildTaxonMap(createParser(taxaCsv));

    final Taxon informalTaxon = taxaById.get("268");
    assertThat(informalTaxon.getName(), is("Chenopodiaceae"));

    final Taxon genusTaxon = taxaById.get("41886");
    assertThat(genusTaxon, is(nullValue()));
}
Also used : Taxon(org.eol.globi.domain.Taxon) JUnitMatchers.containsString(org.junit.matchers.JUnitMatchers.containsString) LabeledCSVParser(com.Ostermiller.util.LabeledCSVParser) Test(org.junit.Test)

Example 19 with LabeledCSVParser

use of com.Ostermiller.util.LabeledCSVParser in project eol-globi-data by jhpoelen.

the class TaxonCacheService method initTaxonIdMap.

/**
 * Instantiates the taxon lookup service on a "lucene" directory below the cache dir and,
 * unless a pre-existing non-temporary index is found, (re-)indexes it from
 * {@code taxonMapResource}.
 *
 * <p>For every line of the taxon map, the provided id, the provided name and the resolved
 * name are each offered to {@code addIfNeeded} against the resolved external id. When
 * done, the service is finished and stored in {@code this.taxonLookupService}.
 *
 * @throws PropertyEnricherException wrapping any {@link IOException} raised while setting
 *                                   up the index or reading (or closing) the taxon map
 */
private void initTaxonIdMap() throws PropertyEnricherException {
    try {
        LOG.info("taxon lookup service instantiating...");
        File luceneDir = new File(getCacheDir().getAbsolutePath(), "lucene");
        // Must be sampled before createCacheDir(...) is invoked on the same directory.
        boolean preexisting = luceneDir.exists();
        createCacheDir(luceneDir, isTemporary());
        TaxonLookupServiceImpl taxonLookupService = new TaxonLookupServiceImpl(new SimpleFSDirectory(luceneDir));
        taxonLookupService.setMaxHits(getMaxTaxonLinks());
        taxonLookupService.start();
        if (!isTemporary() && preexisting) {
            LOG.info("pre-existing taxon lookup index found, no need to re-index...");
        } else {
            LOG.info("no pre-existing taxon lookup index found, re-indexing...");
            int count = 0;
            LOG.info("taxon map loading [" + taxonMapResource + "] ...");
            StopWatch watch = new StopWatch();
            watch.start();
            // try-with-resources: the reader was previously never closed, leaking the
            // underlying resource on both the success and the failure path.
            try (BufferedReader reader = createBufferedReader(taxonMapResource)) {
                final LabeledCSVParser labeledCSVParser = CSVTSVUtil.createLabeledTSVParser(reader);
                while (labeledCSVParser.getLine() != null) {
                    Taxon provided = TaxonMapParser.parseProvidedTaxon(labeledCSVParser);
                    Taxon resolved = TaxonMapParser.parseResolvedTaxon(labeledCSVParser);
                    addIfNeeded(taxonLookupService, provided.getExternalId(), resolved.getExternalId());
                    addIfNeeded(taxonLookupService, provided.getName(), resolved.getExternalId());
                    addIfNeeded(taxonLookupService, resolved.getName(), resolved.getExternalId());
                    count++;
                }
            }
            watch.stop();
            logCacheLoadStats(watch.getTime(), count);
            LOG.info("taxon map loading [" + taxonMapResource + "] done.");
        }
        taxonLookupService.finish();
        this.taxonLookupService = taxonLookupService;
        LOG.info("taxon lookup service instantiating done.");
    } catch (IOException e) {
        throw new PropertyEnricherException("problem initiating taxon cache index", e);
    }
}
Also used : PropertyEnricherException(org.eol.globi.service.PropertyEnricherException) Taxon(org.eol.globi.domain.Taxon) BufferedReader(java.io.BufferedReader) LabeledCSVParser(com.Ostermiller.util.LabeledCSVParser) IOException(java.io.IOException) File(java.io.File) SimpleFSDirectory(org.apache.lucene.store.SimpleFSDirectory) StopWatch(org.apache.commons.lang3.time.StopWatch)

Example 20 with LabeledCSVParser

use of com.Ostermiller.util.LabeledCSVParser in project eol-globi-data by jhpoelen.

the class TaxonCacheService method taxonCacheIterator.

/**
 * Returns an iterator over the taxon cache lines of {@code resource}, skipping every
 * line for which {@code skipper.shouldSkipLine(...)} returns true.
 *
 * <p>Each element pairs the taxon's external id (passed through {@code valueOrNoMatch})
 * with the taxon's map representation from {@code TaxonUtil.taxonToMap}.
 *
 * <p>An {@link IOException} raised while reading a line is logged and treated as end of
 * input by {@code hasNext()}. {@code remove()} is not supported.
 *
 * @param resource taxon cache resource to read; the first row is consumed as labels by the
 *                 labeled TSV parser
 * @param skipper  decides, per parsed line, whether that line is skipped
 * @return iterator over (external id, taxon properties) tuples
 * @throws IOException if the reader or parser for {@code resource} cannot be created
 */
public static Iterator<Fun.Tuple2<String, Map<String, String>>> taxonCacheIterator(final String resource, final LineSkipper skipper) throws IOException {
    return new Iterator<Fun.Tuple2<String, Map<String, String>>>() {

        private BufferedReader reader = createBufferedReader(resource);

        private final LabeledCSVParser labeledCSVParser = CSVTSVUtil.createLabeledTSVParser(reader);

        // True while the parser's current line has been read but not yet handed out by next().
        private AtomicBoolean lineReady = new AtomicBoolean(false);

        @Override
        public boolean hasNext() {
            try {
                boolean hasNext;
                do {
                    // Only read a new line when no accepted line is pending.
                    hasNext = lineReady.get() || consumeLine(labeledCSVParser);
                } while (hasNext && skipper.shouldSkipLine(labeledCSVParser));
                return hasNext;
            } catch (IOException e) {
                LOG.error("failed to get next line", e);
                return false;
            }
        }

        // Advances the parser by one line and records whether that line is pending for next().
        private boolean consumeLine(LabeledCSVParser labeledCSVParser) throws IOException {
            boolean hasNext = labeledCSVParser.getLine() != null;
            // At end of input there is no current line, so the skipper is not consulted.
            lineReady.set(hasNext && !skipper.shouldSkipLine(labeledCSVParser));
            return hasNext;
        }

        @Override
        public Fun.Tuple2<String, Map<String, String>> next() {
            // Honour the Iterator contract: advance when the caller did not call hasNext(),
            // and fail explicitly instead of re-parsing a stale (or missing) line.
            if (!lineReady.get() && !hasNext()) {
                throw new java.util.NoSuchElementException("no more taxon cache lines in [" + resource + "]");
            }
            final Taxon taxon = TaxonCacheParser.parseLine(labeledCSVParser);
            lineReady.set(false);
            return new Fun.Tuple2<>(valueOrNoMatch(taxon.getExternalId()), TaxonUtil.taxonToMap(taxon));
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException("remove");
        }
    };
}
Also used : AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) Taxon(org.eol.globi.domain.Taxon) Iterator(java.util.Iterator) BufferedReader(java.io.BufferedReader) LabeledCSVParser(com.Ostermiller.util.LabeledCSVParser) IOException(java.io.IOException) Map(java.util.Map) BTreeMap(org.mapdb.BTreeMap) Fun(org.mapdb.Fun)

Aggregations

LabeledCSVParser (com.Ostermiller.util.LabeledCSVParser)82 IOException (java.io.IOException)40 Test (org.junit.Test)31 Study (org.eol.globi.domain.Study)24 StudyImpl (org.eol.globi.domain.StudyImpl)17 Specimen (org.eol.globi.domain.Specimen)15 HashMap (java.util.HashMap)13 ArrayList (java.util.ArrayList)12 Location (org.eol.globi.domain.Location)12 TaxonImpl (org.eol.globi.domain.TaxonImpl)12 CSVParser (com.Ostermiller.util.CSVParser)10 StringReader (java.io.StringReader)8 LocationImpl (org.eol.globi.domain.LocationImpl)8 Taxon (org.eol.globi.domain.Taxon)8 InteractType (org.eol.globi.domain.InteractType)7 File (java.io.File)6 FileInputStream (java.io.FileInputStream)6 InputStream (java.io.InputStream)6 Date (java.util.Date)6 List (java.util.List)6