Search in sources :

Example 1 with LineIterator

use of org.apache.commons.io.LineIterator in project deeplearning4j by deeplearning4j.

the class Word2VecDataFetcher method next.

/**
 * Produces the next data set.
 *
 * <p>When the cache already holds at least {@code batch} entries, or when no files
 * remain, the result comes from {@code fromCache()}. Otherwise the next file is read
 * line by line and a data set is built from the first line that yields windows
 * (or from the last line of the file, whatever it yields).
 *
 * @return the next data set, or {@code null} when the file contained no lines
 * @throws RuntimeException wrapping any {@link IOException} raised while reading the file
 */
@Override
public DataSet next() {
    // Pop from the cache when possible, or when there is nothing left to read.
    if (cache.size() >= batch || !files.hasNext())
        return fromCache();
    File f = files.next();
    try {
        LineIterator lines = FileUtils.lineIterator(f);
        try {
            while (lines.hasNext()) {
                List<Window> windows = Windows.windows(lines.nextLine());
                // Skip lines that produce no windows, unless this is the last line.
                if (windows.isEmpty() && lines.hasNext())
                    continue;
                // At most one batch worth of rows is emitted per call.
                int rows = Math.min(windows.size(), batch);
                INDArray input = Nd4j.create(rows, vec.lookupTable().layerSize() * vec.getWindow());
                // NOTE(review): outcomes always has `batch` rows, even when fewer input
                // rows are produced - confirm whether this should be `rows`.
                INDArray outcomes = Nd4j.create(batch, labels.size());
                for (int i = 0; i < rows; i++) {
                    // NOTE(review): features are built from cache.get(i) while the label
                    // comes from windows.get(i) - confirm this is intended.
                    input.putRow(i, WindowConverter.asExampleMatrix(cache.get(i), vec));
                    int idx = labels.indexOf(windows.get(i).getLabel());
                    // Unknown labels fall back to the first label.
                    if (idx < 0)
                        idx = 0;
                    outcomes.putRow(i, FeatureUtil.toOutcomeVector(idx, labels.size()));
                }
                /*
                 * Note that I'm aware of possible concerns for sentence sequencing.
                 * This is a hack right now in place of something
                 * that will be way more elegant in the future.
                 */
                if (windows.size() > batch) {
                    // Keep the windows that did not fit into this batch for later calls.
                    List<Window> leftOvers = windows.subList(batch, windows.size());
                    cache.addAll(leftOvers);
                }
                return new DataSet(input, outcomes);
            }
        } finally {
            // Release the file handle on every exit path (early return or exception).
            LineIterator.closeQuietly(lines);
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return null;
}
Also used : Window(org.deeplearning4j.text.movingwindow.Window) INDArray(org.nd4j.linalg.api.ndarray.INDArray) DataSet(org.nd4j.linalg.dataset.DataSet) IOException(java.io.IOException) File(java.io.File) LineIterator(org.apache.commons.io.LineIterator)

Example 2 with LineIterator

use of org.apache.commons.io.LineIterator in project druid by druid-io.

the class AggregationTestHelper method createIndex.

/**
 * Parses the JSON parser and aggregator specifications, wraps the input stream in a
 * UTF-8 line iterator, and delegates to the iterator-based {@code createIndex} overload.
 * The input stream is closed through {@code Closeables.close(inputDataStream, true)}
 * whether or not indexing succeeds.
 *
 * @param inputDataStream stream of input lines
 * @param parserJson      JSON for a {@code StringInputRowParser}
 * @param aggregators     JSON list of {@code AggregatorFactory} definitions
 * @param outDir          passed through to the delegate
 * @param minTimestamp    passed through to the delegate
 * @param gran            passed through to the delegate
 * @param maxRowCount     passed through to the delegate
 * @throws Exception if JSON parsing or the delegated index creation fails
 */
public void createIndex(InputStream inputDataStream, String parserJson, String aggregators, File outDir, long minTimestamp, Granularity gran, int maxRowCount) throws Exception {
    try {
        final StringInputRowParser rowParser = mapper.readValue(parserJson, StringInputRowParser.class);
        final LineIterator rows = IOUtils.lineIterator(inputDataStream, "UTF-8");
        final TypeReference<List<AggregatorFactory>> factoryListType = new TypeReference<List<AggregatorFactory>>() {
        };
        final List<AggregatorFactory> factories = mapper.readValue(aggregators, factoryListType);
        final AggregatorFactory[] factoryArray = factories.toArray(new AggregatorFactory[0]);
        createIndex(rows, rowParser, factoryArray, outDir, minTimestamp, gran, true, maxRowCount);
    } finally {
        Closeables.close(inputDataStream, true);
    }
}
Also used : StringInputRowParser(io.druid.data.input.impl.StringInputRowParser) List(java.util.List) ArrayList(java.util.ArrayList) LineIterator(org.apache.commons.io.LineIterator)

Example 3 with LineIterator

use of org.apache.commons.io.LineIterator in project deeplearning4j by deeplearning4j.

the class WordVectorSerializer method loadTxt.

/**
 * Loads an in memory cache from the given path (sets syn0 and the vocab).
 *
 * <p>The file is read as UTF-8, one entry per line: a word token (passed through
 * {@code decodeB64}) followed by space-separated float components. The first line is
 * treated as a header, and skipped, when it contains no space or when every
 * space-separated token on it parses as a {@code long}; otherwise it is read as the
 * first data row.
 *
 * @param vectorsFile the path of the file to load
 * @return a Pair holding the lookup table and the vocab cache.
 * @throws FileNotFoundException if the input file does not exist
 * @throws UnsupportedEncodingException if the UTF-8 charset is not available
 */
public static Pair<InMemoryLookupTable, VocabCache> loadTxt(File vectorsFile) throws FileNotFoundException, UnsupportedEncodingException {
    BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(vectorsFile), "UTF-8"));
    AbstractCache cache = new AbstractCache<>();
    LineIterator iter = IOUtils.lineIterator(reader);
    try {
        // Holds the first line only when it turned out to be a data row, not a header.
        String line = null;
        if (iter.hasNext()) {
            String first = iter.nextLine();
            boolean hasHeader;
            //look for spaces
            if (!first.contains(" ")) {
                log.debug("Skipping first line");
                hasHeader = true;
            } else {
                // we should check for something that looks like proper word vectors here. i.e: 1 word at the 0 position, and bunch of floats further
                String[] split = first.split(" ");
                try {
                    long[] header = new long[split.length];
                    for (int x = 0; x < split.length; x++) {
                        header[x] = Long.parseLong(split[x]);
                    }
                    // [2] - number of documents <-- DL4j-only value
                    if (split.length == 3)
                        cache.incrementTotalDocCount(header[2]);
                    printOutProjectedMemoryUse(header[0], (int) header[1], 1);
                    hasHeader = true;
                } catch (Exception e) {
                    // any conversion failure means the line is a data row, not a header
                    hasHeader = false;
                }
            }
            if (!hasHeader)
                line = first;
        }
        // The iterator is already positioned after the first line, so a detected header
        // is skipped by simply continuing to read; reopening the file is unnecessary and
        // previously switched the data rows to the platform default charset.
        List<INDArray> arrays = new ArrayList<>();
        // "line != null" keeps a pending first row even when it is the only line in the file.
        while (line != null || iter.hasNext()) {
            if (line == null)
                line = iter.nextLine();
            String[] split = line.split(" ");
            //split[0].replaceAll(whitespaceReplacement, " ");
            String word = decodeB64(split[0]);
            VocabWord word1 = new VocabWord(1.0, word);
            word1.setIndex(cache.numWords());
            cache.addToken(word1);
            cache.addWordToIndex(word1.getIndex(), word);
            cache.putVocabWord(word);
            float[] vector = new float[split.length - 1];
            for (int i = 1; i < split.length; i++) {
                vector[i - 1] = Float.parseFloat(split[i]);
            }
            INDArray row = Nd4j.create(vector);
            arrays.add(row);
            // mark the current line as consumed
            line = null;
        }
        INDArray syn = Nd4j.vstack(arrays);
        InMemoryLookupTable lookupTable = (InMemoryLookupTable) new InMemoryLookupTable.Builder().vectorLength(arrays.get(0).columns()).useAdaGrad(false).cache(cache).useHierarchicSoftmax(false).build();
        if (Nd4j.ENFORCE_NUMERICAL_STABILITY)
            Nd4j.clearNans(syn);
        lookupTable.setSyn0(syn);
        return new Pair<>(lookupTable, (VocabCache) cache);
    } finally {
        // Closes the iterator and its underlying reader on success and on failure.
        LineIterator.closeQuietly(iter);
    }
}
Also used : ArrayList(java.util.ArrayList) VocabWord(org.deeplearning4j.models.word2vec.VocabWord) AbstractCache(org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache) LineIterator(org.apache.commons.io.LineIterator) BasicLineIterator(org.deeplearning4j.text.sentenceiterator.BasicLineIterator) DL4JInvalidInputException(org.deeplearning4j.exception.DL4JInvalidInputException) ND4JIllegalStateException(org.nd4j.linalg.exception.ND4JIllegalStateException) InMemoryLookupTable(org.deeplearning4j.models.embeddings.inmemory.InMemoryLookupTable) INDArray(org.nd4j.linalg.api.ndarray.INDArray) Pair(org.deeplearning4j.berkeley.Pair)

Example 4 with LineIterator

use of org.apache.commons.io.LineIterator in project deeplearning4j by deeplearning4j.

the class GloveWeightLookupTable method load.

/**
 * Load a glove model from an input stream.
 * The format is:
 * word num1 num2....
 *
 * <p>Blank lines are ignored. The vector length is taken from the first non-blank
 * line (token count minus one). The stream is closed when this method returns,
 * whether it succeeds or fails.
 *
 * @param is the input stream to read from for the weights
 * @param vocab the vocab for the lookuptable
 * @return the loaded model
 * @throws java.io.IOException if one occurs, or if the stream contains no non-blank line
 */
public static GloveWeightLookupTable load(InputStream is, VocabCache<? extends SequenceElement> vocab) throws IOException {
    LineIterator iter = IOUtils.lineIterator(is, "UTF-8");
    try {
        GloveWeightLookupTable glove = null;
        Map<String, float[]> wordVectors = new HashMap<>();
        while (iter.hasNext()) {
            String line = iter.nextLine().trim();
            if (line.isEmpty())
                continue;
            String[] split = line.split(" ");
            String word = split[0];
            // The table is built lazily because its vector length is only known
            // once the first data line has been seen.
            if (glove == null)
                glove = new GloveWeightLookupTable.Builder().cache(vocab).vectorLength(split.length - 1).build();
            if (word.isEmpty())
                continue;
            float[] read = read(split, glove.layerSize());
            if (read.length < 1)
                continue;
            wordVectors.put(word, read);
        }
        // Without any data line there is no table to populate; fail with a clear
        // message instead of a NullPointerException.
        if (glove == null)
            throw new IOException("No word vectors found in the GloVe input stream");
        glove.setSyn0(weights(glove, wordVectors, vocab));
        glove.resetWeights(false);
        return glove;
    } finally {
        // Release the iterator and its underlying stream on every exit path.
        LineIterator.closeQuietly(iter);
    }
}
Also used : HashMap(java.util.HashMap) LineIterator(org.apache.commons.io.LineIterator)

Example 5 with LineIterator

use of org.apache.commons.io.LineIterator in project opennms by OpenNMS.

the class JettyConfigMigratorOffline method execute.

/* (non-Javadoc)
 * @see org.opennms.upgrade.api.OnmsUpgrade#execute()
 *
 * When the HTTPS and/or AJP port property is set, rewrites etc/jetty.xml from the
 * example file, dropping the XML comment markers that follow the
 * "<!-- Add AJP support -->" / "<!-- Add HTTPS support -->" marker lines so that the
 * corresponding connector section becomes active. Throws OnmsUpgradeException when
 * the rewrite fails or when a requested section could not be uncommented.
 */
@Override
public void execute() throws OnmsUpgradeException {
    String jettySSL = getMainProperties().getProperty("org.opennms.netmgt.jetty.https-port", null);
    String jettyAJP = getMainProperties().getProperty("org.opennms.netmgt.jetty.ajp-port", null);
    boolean sslWasFixed = false;
    boolean ajpWasFixed = false;
    try {
        log("SSL Enabled ? %s\n", jettySSL != null);
        log("AJP Enabled ? %s\n", jettyAJP != null);
        if (jettySSL != null || jettyAJP != null) {
            File jettyXmlExample = new File(getHomeDirectory(), "etc" + File.separator + "examples" + File.separator + "jetty.xml");
            File jettyXml = new File(getHomeDirectory(), "etc" + File.separator + "jetty.xml");
            if (!jettyXml.exists() && !jettyXmlExample.exists()) {
                throw new FileNotFoundException("The required file doesn't exist: " + jettyXmlExample);
            }
            if (!jettyXml.exists()) {
                log("Copying %s into %s\n", jettyXmlExample, jettyXml);
                FileUtils.copyFile(jettyXmlExample, jettyXml);
            }
            log("Creating %s\n", jettyXml);
            // The new content is written to a temporary file and copied over jetty.xml at the end.
            File tempFile = new File(jettyXml.getAbsoluteFile() + ".tmp");
            FileWriter w = new FileWriter(tempFile);
            try {
                // NOTE(review): the content is always read from the example file, even when
                // jetty.xml already exists - confirm this is intended.
                LineIterator it = FileUtils.lineIterator(jettyXmlExample);
                try {
                    boolean startSsl = false;
                    boolean startAjp = false;
                    while (it.hasNext()) {
                        String line = it.next();
                        if (startAjp) {
                            // Drop the opening comment marker of the AJP section.
                            if (line.matches("^\\s+[<][!]--\\s*$")) {
                                continue;
                            }
                            // Drop the closing comment marker; the AJP section is now active.
                            if (line.matches("^\\s+--[>]\\s*$")) {
                                startAjp = false;
                                ajpWasFixed = true;
                                continue;
                            }
                        }
                        if (startSsl) {
                            // Drop the opening comment marker of the HTTPS section.
                            if (line.matches("^\\s+[<][!]--\\s*$")) {
                                continue;
                            }
                            // Drop the closing comment marker; the HTTPS section is now active.
                            if (line.matches("^\\s+--[>]\\s*$")) {
                                startSsl = false;
                                sslWasFixed = true;
                                continue;
                            }
                        }
                        w.write(line + "\n");
                        // A marker line switches on comment stripping for the lines that follow it.
                        if (!startAjp && line.contains("<!-- Add AJP support -->") && jettyAJP != null) {
                            startAjp = true;
                            log("Enabling AjpConnector\n");
                        }
                        if (!startSsl && line.contains("<!-- Add HTTPS support -->") && jettySSL != null) {
                            startSsl = true;
                            log("Enabling SslSelectChannelConnector\n");
                        }
                    }
                } finally {
                    // Release the example file handle even when reading or writing fails.
                    LineIterator.closeQuietly(it);
                }
            } finally {
                // Always release the writer; on the success path this also flushes the
                // temporary file before it is copied.
                w.close();
            }
            FileUtils.copyFile(tempFile, jettyXml);
            FileUtils.deleteQuietly(tempFile);
        } else {
            log("Neither SSL nor AJP are enabled.\n");
        }
    } catch (Exception e) {
        throw new OnmsUpgradeException("Can't fix Jetty configuration because " + e.getMessage(), e);
    }
    if (jettyAJP != null && !ajpWasFixed) {
        throw new OnmsUpgradeException("Can't enable AJP, please manually edit jetty.xml and uncomment the section where org.eclipse.jetty.ajp.Ajp13SocketConnector is defined.");
    }
    if (jettySSL != null && !sslWasFixed) {
        throw new OnmsUpgradeException("Can't enable SSL, please manually edit jetty.xml and uncomment the section where org.eclipse.jetty.server.ssl.SslSelectChannelConnector is defined.");
    }
}
Also used : FileWriter(java.io.FileWriter) FileNotFoundException(java.io.FileNotFoundException) File(java.io.File) LineIterator(org.apache.commons.io.LineIterator) OnmsUpgradeException(org.opennms.upgrade.api.OnmsUpgradeException) FileNotFoundException(java.io.FileNotFoundException) OnmsUpgradeException(org.opennms.upgrade.api.OnmsUpgradeException)

Aggregations

LineIterator (org.apache.commons.io.LineIterator): 42; IOException (java.io.IOException): 24; File (java.io.File): 13; InputStream (java.io.InputStream): 12; ArrayList (java.util.ArrayList): 9; HashMap (java.util.HashMap): 8; StringReader (java.io.StringReader): 7; FileIteratingFirehose (io.druid.data.input.impl.FileIteratingFirehose): 5; BufferedReader (java.io.BufferedReader): 5; InputStreamReader (java.io.InputStreamReader): 5; Matcher (java.util.regex.Matcher): 5; Pattern (java.util.regex.Pattern): 5; UnexpectedServerException (com.pratilipi.common.exception.UnexpectedServerException): 4; FileNotFoundException (java.io.FileNotFoundException): 4; FileWriter (java.io.FileWriter): 3; Reader (java.io.Reader): 3; URISyntaxException (java.net.URISyntaxException): 3; DataAccessor (com.pratilipi.data.DataAccessor): 2; BufferedWriter (java.io.BufferedWriter): 2; FileReader (java.io.FileReader): 2