Search in sources :

Example 11 with LineIterator

use of org.apache.commons.io.LineIterator in project sling by apache.

the class SimpleDistributionQueueProvider method enableQueueProcessing.

/**
 * Starts processing of the given queues and, when checkpointing is enabled, first restores
 * queue items from checkpoint files and schedules periodic checkpoint jobs.
 * <p>
 * Each checkpoint file is named {@code <queueName>-checkpoint} and is read line by line; every
 * line is expected to be {@code <itemId> <json-object>}. JSON arrays become {@code String[]}
 * values, JSON null becomes {@code null}, and every other value is read as a JSON string.
 *
 * @param queueProcessor the processor handed to each scheduled {@code SimpleDistributionQueueProcessor}
 * @param queueNames     names of the queues to recover (if checkpointing) and to process
 */
public void enableQueueProcessing(@Nonnull DistributionQueueProcessor queueProcessor, String... queueNames) {
    if (checkpoint) {
        // recover from checkpoints
        log.debug("recovering from checkpoints if needed");
        for (final String queueName : queueNames) {
            log.debug("recovering for queue {}", queueName);
            DistributionQueue queue = getQueue(queueName);
            FilenameFilter filenameFilter = new FilenameFilter() {

                @Override
                public boolean accept(File file, String name) {
                    return name.equals(queueName + "-checkpoint");
                }
            };
            // listFiles returns null (not an empty array) if the directory is missing or unreadable
            File[] checkpointFiles = checkpointDirectory.listFiles(filenameFilter);
            if (checkpointFiles == null) {
                log.warn("could not list checkpoint files in {}", checkpointDirectory);
                continue;
            }
            for (File qf : checkpointFiles) {
                log.info("recovering from checkpoint {}", qf);
                FileReader fileReader = null;
                try {
                    fileReader = new FileReader(qf);
                    LineIterator lineIterator = IOUtils.lineIterator(fileReader);
                    while (lineIterator.hasNext()) {
                        String s = lineIterator.nextLine();
                        // split only on the first space: the JSON payload may itself contain spaces
                        String[] split = s.split(" ", 2);
                        if (split.length < 2) {
                            log.warn("skipping malformed line in checkpoint file {}", qf.getAbsolutePath());
                            continue;
                        }
                        String id = split[0];
                        String infoString = split[1];
                        Map<String, Object> info = new HashMap<String, Object>();
                        JsonReader reader = Json.createReader(new StringReader(infoString));
                        JsonObject jsonObject = reader.readObject();
                        for (Map.Entry<String, JsonValue> entry : jsonObject.entrySet()) {
                            if (entry.getValue().getValueType().equals(JsonValue.ValueType.ARRAY)) {
                                JsonArray value = jsonObject.getJsonArray(entry.getKey());
                                String[] a = new String[value.size()];
                                for (int i = 0; i < a.length; i++) {
                                    a[i] = value.getString(i);
                                }
                                info.put(entry.getKey(), a);
                            } else if (JsonValue.NULL.equals(entry.getValue())) {
                                info.put(entry.getKey(), null);
                            } else {
                                info.put(entry.getKey(), ((JsonString) entry.getValue()).getString());
                            }
                        }
                        queue.add(new DistributionQueueItem(id, info));
                    }
                    log.info("recovered {} items from queue {}", queue.getStatus().getItemsCount(), queueName);
                } catch (FileNotFoundException e) {
                    log.warn("could not read checkpoint file {}", qf.getAbsolutePath(), e);
                } catch (JsonException e) {
                    log.warn("could not parse info from checkpoint file {}", qf.getAbsolutePath(), e);
                } finally {
                    // always release the file handle, also when parsing fails half-way
                    IOUtils.closeQuietly(fileReader);
                }
            }
        }
        // enable checkpointing
        for (String queueName : queueNames) {
            ScheduleOptions options = scheduler.NOW(-1, 15).canRunConcurrently(false).name(getJobName(queueName + "-checkpoint"));
            scheduler.schedule(new SimpleDistributionQueueCheckpoint(getQueue(queueName), checkpointDirectory), options);
        }
    }
    // enable processing
    for (String queueName : queueNames) {
        ScheduleOptions options = scheduler.NOW(-1, 1).canRunConcurrently(false).name(getJobName(queueName));
        scheduler.schedule(new SimpleDistributionQueueProcessor(getQueue(queueName), queueProcessor), options);
    }
}
Also used : JsonException(javax.json.JsonException) HashMap(java.util.HashMap) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) FileNotFoundException(java.io.FileNotFoundException) JsonObject(javax.json.JsonObject) JsonString(javax.json.JsonString) LineIterator(org.apache.commons.io.LineIterator) FilenameFilter(java.io.FilenameFilter) StringReader(java.io.StringReader) JsonReader(javax.json.JsonReader) FileReader(java.io.FileReader) DistributionQueue(org.apache.sling.distribution.queue.DistributionQueue) JsonValue(javax.json.JsonValue) DistributionQueueItem(org.apache.sling.distribution.queue.DistributionQueueItem) JsonArray(javax.json.JsonArray) ScheduleOptions(org.apache.sling.commons.scheduler.ScheduleOptions) JsonObject(javax.json.JsonObject) JsonString(javax.json.JsonString) File(java.io.File) HashMap(java.util.HashMap) Map(java.util.Map) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap)

Example 12 with LineIterator

use of org.apache.commons.io.LineIterator in project stanbol by apache.

the class TikaEngineTest method assertContentRegexp.

/**
 * Asserts that each of the given regular expressions matches at least one line of the
 * blob's content. Every pattern is checked against a fresh stream obtained from
 * {@code blob.getStream()}, decoded with the blob's {@code charset} parameter or the
 * platform default charset if that parameter is absent.
 *
 * @param blob   the blob whose content is checked
 * @param regexp the expressions; each is wrapped in {@code .*} on both sides and must
 *               match a whole line
 * @throws IOException if the content cannot be read while building the failure message
 */
public void assertContentRegexp(Blob blob, String... regexp) throws IOException {
    Charset charset;
    if (blob.getParameter().containsKey("charset")) {
        charset = Charset.forName(blob.getParameter().get("charset"));
    } else {
        charset = Charset.defaultCharset();
    }
    for (String expr : regexp) {
        final Pattern p = Pattern.compile(".*" + expr + ".*");
        boolean matched = false;
        final Reader reader = new InputStreamReader(blob.getStream(), charset);
        try {
            final LineIterator it = new LineIterator(reader);
            while (!matched && it.hasNext()) {
                matched = p.matcher(it.nextLine()).matches();
            }
        } finally {
            // close the reader for every pattern, including the last one and the failing one
            closeQuietly(reader);
        }
        if (!matched) {
            fail(this + ": no match for regexp '" + expr + "', content=\n" + IOUtils.toString(blob.getStream(), charset.toString()));
        }
    }
}
Also used : Pattern(java.util.regex.Pattern) InputStreamReader(java.io.InputStreamReader) Charset(java.nio.charset.Charset) Reader(java.io.Reader) InputStreamReader(java.io.InputStreamReader) LineIterator(org.apache.commons.io.LineIterator)

Example 13 with LineIterator

use of org.apache.commons.io.LineIterator in project stanbol by apache.

the class BenchmarkServlet method getExampleBenchmarkPaths.

/**
 * Converts every line of the text returned by {@code getBenchmarkText("/LIST.txt")} into an
 * example benchmark path via {@code getExampleBenchmarkPath}.
 *
 * @param request the current request, passed through to {@code getExampleBenchmarkPath}
 * @return the resulting paths, in the order the lines appear in the list
 * @throws IOException declared by this method; presumably raised when the list text cannot be loaded
 */
private List<String> getExampleBenchmarkPaths(HttpServletRequest request) throws IOException {
    // TODO how to enumerate bundle resources?
    final StringReader listing = new StringReader(getBenchmarkText("/LIST.txt"));
    final List<String> paths = new LinkedList<String>();
    for (final LineIterator lines = new LineIterator(listing); lines.hasNext(); ) {
        final String entry = lines.nextLine();
        paths.add(getExampleBenchmarkPath(request, entry));
    }
    return paths;
}
Also used : StringReader(java.io.StringReader) LineIterator(org.apache.commons.io.LineIterator) LinkedList(java.util.LinkedList)

Example 14 with LineIterator

use of org.apache.commons.io.LineIterator in project stanbol by apache.

the class GeonamesIndexingSource method entityDataIterator.

/**
 * Creates the single-use iterator over all geonames entries of the configured resources.
 * Resources are opened one after the other; each line of a resource is parsed into a
 * {@link Representation} when {@code next()} is called.
 *
 * @return the iterator over the entity ids (the representation of the current entity is
 *         available via {@code getRepresentation()})
 * @throws IllegalStateException if this method was already called on this instance
 */
@Override
public EntityDataIterator entityDataIterator() {
    if (!consumed) {
        consumed = true;
    } else {
        throw new IllegalStateException("This implementation supports only a single Iteration of the data.");
    }
    return new EntityDataIterator() {

        Iterator<RDFTerm> resources = resourceList.iterator();

        RDFTerm r;

        LineIterator it = null;

        // look-ahead line: filled by hasNext()/next(), cleared once consumed
        private String next;

        // representation created by the most recent call to next()
        private Representation rep;

        /**
         * Returns the next line, switching to the next resource (and closing the
         * previous one) whenever the current resource is exhausted.
         * @return the next line or <code>null</code> if there is no more data
         */
        private String getNext() {
            while ((it == null || !it.hasNext()) && resources != null && resources.hasNext()) {
                if (r != null) {
                    IOUtils.closeQuietly(r.is);
                }
                r = resources.next();
                try {
                    it = r.getEntries();
                } catch (IOException e) {
                    log.error("Unable to read RDFTerm '" + r.getName() + "' because of " + e.getMessage(), e);
                    IOUtils.closeQuietly(r.is);
                    it = null;
                }
                resources.remove();
            }
            if (it != null && it.hasNext()) {
                return it.nextLine();
            } else {
                return null;
            }
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }

        @Override
        public String next() {
            if (next == null) {
                next = getNext();
            }
            if (next == null) {
                throw new NoSuchElementException();
            } else {
                rep = processGeonameEntry(next);
                next = null;
                return rep.getId();
            }
        }

        @Override
        public boolean hasNext() {
            if (next == null) {
                next = getNext();
            }
            return next != null;
        }

        @Override
        public Representation getRepresentation() {
            return rep;
        }

        @Override
        public void close() {
            if (r != null) {
                IOUtils.closeQuietly(r.is);
            }
            next = null;
            it = null;
            resources = null;
        }

        /**
         * Parses the Representation from the current line.<p>
         * NOTE: this does not process alternate labels and also does not
         * lookup entities for parent codes. Those things are done now by
         * own EntityProcessors
         * @param line the line to process
         * @return the representation
         */
        private Representation processGeonameEntry(String line) {
            LineTokenizer t = new LineTokenizer(line);
            //[0] geonames id
            String id = t.next();
            Integer geoNamesId = Integer.parseInt(id);
            //create a new Doc based on the first Element (geonamesID)
            Representation doc = valueFactory.createRepresentation(new StringBuilder(GEONAMES_RESOURCE_NS).append(id).append('/').toString());
            //add the Integer id so that we do not need to parse it from the subject URI
            doc.add(GeonamesPropertyEnum.idx_id.toString(), geoNamesId);
            //add the geonames:Feature type
            doc.add(GeonamesPropertyEnum.rdf_type.toString(), getReference(GeonamesPropertyEnum.gn_Feature.toString()));
            //[1] UTF-8 name
            String utf8Label = t.next();
            //[2] ASCII Name as rdfs:label
            String askiiLabel = t.next();
            if (utf8Label == null) {
                //use ASCII label as fallback for the utf8 version
                utf8Label = askiiLabel;
            }
            doc.addNaturalText(GeonamesPropertyEnum.gn_name.toString(), utf8Label);
            //[3] Alternate Names
            //alternate names are added later during processing
            t.next();
            //[4] lat
            doc.add(GeonamesPropertyEnum.geo_lat.toString(), new BigDecimal(t.next()));
            //[5] lon
            doc.add(GeonamesPropertyEnum.geo_long.toString(), new BigDecimal(t.next()));
            //[6] featureClass
            String featureClass = new StringBuilder(GEONAMES_ONTOLOGY_NS).append(t.next()).toString();
            doc.add(GeonamesPropertyEnum.gn_featureClass.toString(), getReference(featureClass));
            //[7] featureCode (-> need to use <featureClass>.<featureCode>!!)
            doc.add(GeonamesPropertyEnum.gn_featureCode.toString(), getReference(new StringBuilder(featureClass).append('.').append(t.next()).toString()));
            //countryCode
            //  -> geonames uses here the link to an HTML Page showing the Country
            //     We would like to use an Link to a SKOS:Concept representing the Country
            // ... But luckily here we need only to add the URI!
            Set<String> ccs = new HashSet<String>();
            //[8] countryCode
            String countryCode = t.next();
            if (countryCode != null) {
                //need to trim because some country codes use '  ' to indicate null!
                countryCode = countryCode.trim();
                if (countryCode.length() == 2) {
                    //Yes there are some features that are in no country!
                    ccs.add(countryCode);
                }
            }
            //[9] alternate countryCodes
            String altCc = t.next();
            if (altCc != null) {
                StringTokenizer altCcT = new StringTokenizer(altCc, ",");
                while (altCcT.hasMoreElements()) {
                    //use a separate local: the primary countryCode is still needed
                    //below as the first element of the admin codes
                    String altCountryCode = altCcT.nextToken();
                    if (altCountryCode.length() == 2) {
                        ccs.add(altCountryCode);
                    }
                }
            }
            if (!ccs.isEmpty()) {
                doc.add(GeonamesPropertyEnum.gn_countryCode.toString(), ccs);
            }
            //[10 TO 13] Admin codes
            //first read them -> we need to consume the tokens anyway
            String[] adminCodes = new String[] { //country
            countryCode, //ADM1
            t.next(), //ADM2
            t.next(), //ADM3
            t.next(), //ADM4
            t.next() };
            //Workaround for Admin1 -> add leading '0' for single Value
            if (adminCodes[1] != null && adminCodes[1].length() < 2) {
                adminCodes[1] = '0' + adminCodes[1];
            }
            //now process the admin Codes (including the country at index 0)
            StringBuilder parentCode = new StringBuilder();
            //iterate over parent codes until the first NULL (or '00' unknown) element
            for (int i = 0; i < adminCodes.length && adminCodes[i] != null && !adminCodes[i].equals("00"); i++) {
                if (i > 0) {
                    parentCode.append('.');
                }
                //add the current (last) Element
                parentCode.append(adminCodes[i]);
                String property = i == 0 ? GeonamesPropertyEnum.idx_CC.toString() : new StringBuilder(GeonamesPropertyEnum.idx_ADM.toString()).append(i).toString();
                // add each level
                doc.add(property, parentCode.toString());
            }
            //[14] population
            String populationString = t.next();
            if (populationString != null) {
                //NOTE: Long is required because some populations exceed the int range;
                //      the comparison must therefore also be done on the long value
                //      (intValue() would overflow to a negative number)
                Long population = Long.valueOf(populationString);
                if (population.longValue() > 0) {
                    doc.add(GeonamesPropertyEnum.gn_population.toString(), population);
                }
            }
            //[15 TO 16] elevation and gtopo30
            String altString = t.next();
            if (altString == null) {
                //if no elevation than use the gtopo30
                altString = t.next();
            } else {
                //if there is already an elevation, than consume this entry
                t.next();
            }
            //neither elevation nor gtopo30 present -> no altitude to add
            if (altString != null) {
                Integer alt = Integer.valueOf(altString);
                if (alt.intValue() > -9999) {
                    //it looks like that -9999 is sometimes used as not known!
                    doc.add(GeonamesPropertyEnum.geo_alt.toString(), alt);
                }
            }
            //[17] time zone
            //not used
            t.next();
            //[18] mod-date
            String modDateString = t.next();
            if (modDateString != null) {
                try {
                    doc.add(GeonamesPropertyEnum.dc_date.toString(), TimeUtils.toDate(DataTypeEnum.DateTime, modDateString));
                } catch (IllegalArgumentException e) {
                    log.warn(String.format("Unable to parse modificationDate for geonamesID %s from value %s", doc.getId(), modDateString));
                }
            }
            return doc;
        }
    };
}
Also used : Representation(org.apache.stanbol.entityhub.servicesapi.model.Representation) IOException(java.io.IOException) EntityDataIterator(org.apache.stanbol.entityhub.indexing.core.EntityDataIterator) LineIterator(org.apache.commons.io.LineIterator) BigDecimal(java.math.BigDecimal) StringTokenizer(java.util.StringTokenizer) LineIterator(org.apache.commons.io.LineIterator) Iterator(java.util.Iterator) EntityDataIterator(org.apache.stanbol.entityhub.indexing.core.EntityDataIterator) NoSuchElementException(java.util.NoSuchElementException) HashSet(java.util.HashSet)

Example 15 with LineIterator

use of org.apache.commons.io.LineIterator in project stanbol by apache.

the class StanbolResourceLoader method getLines.

/**
 * Reads the given resource as UTF-8 text and returns its lines, skipping empty lines
 * and lines starting with {@code '#'}.
 *
 * @param resource the name of the resource, passed to {@code openResource}
 * @return the remaining lines in file order
 * @throws IOException if the resource cannot be opened
 */
public List<String> getLines(String resource) throws IOException {
    List<String> lines = new ArrayList<String>();
    LineIterator it = IOUtils.lineIterator(openResource(resource), "UTF-8");
    try {
        while (it.hasNext()) {
            String line = it.nextLine();
            if (line != null && !line.isEmpty() && line.charAt(0) != '#') {
                lines.add(line);
            }
        }
    } finally {
        // closes the underlying resource stream as well, even if reading fails
        LineIterator.closeQuietly(it);
    }
    return lines;
}
Also used : ArrayList(java.util.ArrayList) LineIterator(org.apache.commons.io.LineIterator)

Aggregations

LineIterator (org.apache.commons.io.LineIterator)42 IOException (java.io.IOException)24 File (java.io.File)13 InputStream (java.io.InputStream)12 ArrayList (java.util.ArrayList)9 HashMap (java.util.HashMap)8 StringReader (java.io.StringReader)7 FileIteratingFirehose (io.druid.data.input.impl.FileIteratingFirehose)5 BufferedReader (java.io.BufferedReader)5 InputStreamReader (java.io.InputStreamReader)5 Matcher (java.util.regex.Matcher)5 Pattern (java.util.regex.Pattern)5 UnexpectedServerException (com.pratilipi.common.exception.UnexpectedServerException)4 FileNotFoundException (java.io.FileNotFoundException)4 FileWriter (java.io.FileWriter)3 Reader (java.io.Reader)3 URISyntaxException (java.net.URISyntaxException)3 DataAccessor (com.pratilipi.data.DataAccessor)2 BufferedWriter (java.io.BufferedWriter)2 FileReader (java.io.FileReader)2