Search in sources:

Example 6 with InFile

use of edu.illinois.cs.cogcomp.ner.IO.InFile in project cogcomp-nlp by CogComp.

From the class BrownClusters, the method init:

/**
     * Initialize the Brown cluster data. This is a singleton, so this process is synchronized and
     * atomic with respect to the <code>get()</code> method above.
     * @param pathsToClusterFiles the files containing the data.
     * @param thresholds per-resource minimum occurrence counts; words occurring fewer times are skipped.
     * @param isLowercaseBrownClusters per-resource flags indicating the cluster file is lowercased.
     */
public static void init(Vector<String> pathsToClusterFiles, Vector<Integer> thresholds, Vector<Boolean> isLowercaseBrownClusters) {
    try {
        Datastore dsNoCredentials = new Datastore(new ResourceConfigurator().getDefaultConfig());
        File gazDirectory = dsNoCredentials.getDirectory("org.cogcomp.brown-clusters", "brown-clusters", 1.5, false);
        synchronized (INIT_SYNC) {
            brownclusters = new BrownClusters();
            brownclusters.isLowercaseBrownClustersByResource = new boolean[isLowercaseBrownClusters.size()];
            brownclusters.wordToPathByResource = new ArrayList<>();
            brownclusters.resources = new ArrayList<>();
            for (int i = 0; i < pathsToClusterFiles.size(); i++) {
                THashMap<String, String> h = new THashMap<>();
                // The cluster files are fetched from the datastore directory rather than the classpath.
                InputStream is = new FileInputStream(gazDirectory.getPath() + File.separator + pathsToClusterFiles.elementAt(i));
                InFile in = new InFile(is);
                try {
                    String line = in.readLine();
                    int wordsAdded = 0;
                    // Each line holds: <bit-path> <word> <occurrence-count>
                    while (line != null) {
                        StringTokenizer st = new StringTokenizer(line);
                        String path = st.nextToken();
                        String word = st.nextToken();
                        int occ = Integer.parseInt(st.nextToken());
                        if (occ >= thresholds.elementAt(i)) {
                            h.put(word, path);
                            wordsAdded++;
                        }
                        line = in.readLine();
                    }
                    if (ParametersForLbjCode.currentParameters.debug) {
                        logger.info(wordsAdded + " words added");
                    }
                    brownclusters.wordToPathByResource.add(h);
                    brownclusters.isLowercaseBrownClustersByResource[i] = isLowercaseBrownClusters.elementAt(i);
                    brownclusters.resources.add(pathsToClusterFiles.elementAt(i));
                } finally {
                    // Close even when parsing throws, so the file handle is never leaked.
                    in.close();
                }
            }
        }
    } catch (InvalidPortException | InvalidEndpointException | DatastoreException | FileNotFoundException e) {
        // NOTE(review): a failure here leaves the singleton partially initialized and only logs
        // to stderr; consider rethrowing as a runtime exception so callers can react.
        e.printStackTrace();
    }
}
Also used : InFile(edu.illinois.cs.cogcomp.ner.IO.InFile) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) FileNotFoundException(java.io.FileNotFoundException) ResourceConfigurator(edu.illinois.cs.cogcomp.core.resources.ResourceConfigurator) DatastoreException(org.cogcomp.DatastoreException) InvalidPortException(io.minio.errors.InvalidPortException) InvalidEndpointException(io.minio.errors.InvalidEndpointException) FileInputStream(java.io.FileInputStream) StringTokenizer(java.util.StringTokenizer) Datastore(org.cogcomp.Datastore) THashMap(gnu.trove.map.hash.THashMap) File(java.io.File) InFile(edu.illinois.cs.cogcomp.ner.IO.InFile)

Example 7 with InFile

use of edu.illinois.cs.cogcomp.ner.IO.InFile in project cogcomp-nlp by CogComp.

From the class TitleTextNormalizer, the method init:

/**
 * Builds the map from each lowercased token to its most frequent surface form, read from
 * the Brown cluster word-frequency file (lines of: bit-path, word, occurrence count).
 * Does nothing when title-text normalization is disabled in the current parameters.
 */
public static void init() {
    if (!ParametersForLbjCode.currentParameters.normalizeTitleText)
        return;
    InFile in = new InFile(pathToBrownClusterForWordFrequencies);
    try {
        String line = in.readLine();
        lowercasedToNormalizedTokensMap = new HashMap<>();
        HashMap<String, Integer> normalizedTokenCounts = new HashMap<>();
        while (line != null) {
            StringTokenizer st = new StringTokenizer(line);
            // The first token is the Brown cluster bit-path; it is only consumed to advance
            // the tokenizer, never used.
            st.nextToken();
            String word = st.nextToken();
            int occ = Integer.parseInt(st.nextToken());
            String key = word.toLowerCase();
            if (lowercasedToNormalizedTokensMap.containsKey(key)) {
                String normalizedWord = lowercasedToNormalizedTokensMap.get(key);
                int prevCount = normalizedTokenCounts.get(normalizedWord);
                if (prevCount < occ) {
                    // A more frequent surface form replaces the previous one. The stale count
                    // entry for the old form is left behind but is never consulted again.
                    lowercasedToNormalizedTokensMap.put(key, word);
                    normalizedTokenCounts.put(word, occ);
                }
            } else {
                lowercasedToNormalizedTokensMap.put(key, word);
                normalizedTokenCounts.put(word, occ);
            }
            line = in.readLine();
        }
    } finally {
        // The original code never closed the file; close it even on parse errors.
        in.close();
    }
}
Also used : InFile(edu.illinois.cs.cogcomp.ner.IO.InFile) StringTokenizer(java.util.StringTokenizer) HashMap(java.util.HashMap)

Example 8 with InFile

use of edu.illinois.cs.cogcomp.ner.IO.InFile in project cogcomp-nlp by CogComp.

From the class WordEmbeddings, the method init:

/*
     * Loads one embedding table per input file and registers its dimensionality and
     * normalization settings. For now, the parameter minWordAppearanceThres is not used, but
     * I'm planning to use it like I was using the word appearance thresholds on Brown Clusters.
     *
     * Each file line is: <token> <v1> <v2> ... <vD> (space-separated), where D must equal
     * the declared dimensionality for that resource.
     */
public static void init(Vector<String> filenames, Vector<Integer> embeddingDimensionality, Vector<Integer> minWordAppearanceThres, Vector<Boolean> isLowecasedEmbedding, Vector<Double> normalizationConstant, Vector<NormalizationMethod> methods) {
    dimensionalitiesSum = 0;
    dimensionalities = new Vector<>();
    resources = new Vector<>();
    embeddingByResource = new Vector<>();
    isLowecasedEmbeddingByResource = new Vector<>();
    for (int resourceId = 0; resourceId < filenames.size(); resourceId++) {
        HashMap<String, double[]> embedding = new HashMap<>();
        InFile in = new InFile(ResourceUtilities.loadResource(filenames.elementAt(resourceId)));
        // Largest absolute component seen across ALL tokens of this resource (used by OVERALL).
        double maxAbsValueInAnyDimension = 0;
        try {
            String line = in.readLine();
            while (line != null) {
                StringTokenizer st = new StringTokenizer(line, " ");
                String token = st.nextToken();
                Vector<String> v = new Vector<>();
                while (st.hasMoreTokens()) v.addElement(st.nextToken());
                if (v.size() != embeddingDimensionality.elementAt(resourceId))
                    throw new IllegalArgumentException("Warning: unexpected dimensionality of " + v.size() + " for token " + token);
                double[] arr = new double[v.size()];
                // Largest absolute component of THIS token's vector (used by INDEPENDENT).
                double maxInThisDimension = 0;
                for (int i = 0; i < arr.length; i++) {
                    arr[i] = Double.parseDouble(v.elementAt(i));
                    if (maxAbsValueInAnyDimension < Math.abs(arr[i]))
                        maxAbsValueInAnyDimension = Math.abs(arr[i]);
                    if (maxInThisDimension < Math.abs(arr[i]))
                        maxInThisDimension = Math.abs(arr[i]);
                }
                if (maxInThisDimension > 0 && methods.elementAt(resourceId).equals(NormalizationMethod.INDEPENDENT))
                    for (int i = 0; i < arr.length; i++) arr[i] = arr[i] / (normalizationConstant.elementAt(resourceId) * maxInThisDimension);
                embedding.put(token, arr);
                line = in.readLine();
            }
        } finally {
            // Close even when a dimensionality mismatch or parse error is thrown mid-file;
            // the original code leaked the stream in that case.
            in.close();
        }
        if (maxAbsValueInAnyDimension > 0 && methods.elementAt(resourceId).equals(NormalizationMethod.OVERALL))
            for (String s : embedding.keySet()) {
                double[] arr = embedding.get(s);
                for (int j = 0; j < arr.length; j++) arr[j] = arr[j] / (normalizationConstant.elementAt(resourceId) * maxAbsValueInAnyDimension);
            }
        embeddingByResource.addElement(embedding);
        dimensionalitiesSum += embeddingDimensionality.elementAt(resourceId);
        dimensionalities.addElement(embeddingDimensionality.elementAt(resourceId));
        resources.addElement(filenames.elementAt(resourceId));
        isLowecasedEmbeddingByResource.addElement(isLowecasedEmbedding.elementAt(resourceId));
    }
}
Also used : InFile(edu.illinois.cs.cogcomp.ner.IO.InFile) StringTokenizer(java.util.StringTokenizer) HashMap(java.util.HashMap) Vector(java.util.Vector)

Example 9 with InFile

use of edu.illinois.cs.cogcomp.ner.IO.InFile in project cogcomp-nlp by CogComp.

From the class DocumentCollection, the method addDocuments:

/*
     * Reads documents from a file, one document per line, tokenizing with the given delimiters,
     * optionally discarding the first token of each line (e.g. a label) and filtering stop words,
     * then appends each resulting document to this collection with the given class id.
     */
public void addDocuments(String filename, int classID, StopWords stops, boolean discardFirstToken, String tokenizationDelimiters) {
    InFile in = new InFile(filename);
    try {
        Vector<String> words = readTokens(in, stops, discardFirstToken, tokenizationDelimiters);
        while (words != null) {
            // NOTE(review): size() >= 0 is always true, so empty documents are added too;
            // kept as-is to preserve the original behavior (likely intended "> 0").
            if (words.size() >= 0)
                docs.addElement(new Document(words, classID));
            words = readTokens(in, stops, discardFirstToken, tokenizationDelimiters);
        }
    } finally {
        // The original code never closed the file; release the handle even on error.
        in.close();
    }
}

/** Reads one line of tokens, optionally dropping the first token and filtering stop words; null at EOF. */
private Vector<String> readTokens(InFile in, StopWords stops, boolean discardFirstToken, String tokenizationDelimiters) {
    Vector<String> words = in.readLineTokens(tokenizationDelimiters);
    if (discardFirstToken && words != null && words.size() > 0)
        words.removeElementAt(0);
    if (stops != null)
        words = stops.filterStopWords(words);
    return words;
}
Also used : InFile(edu.illinois.cs.cogcomp.ner.IO.InFile)

Aggregations

InFile (edu.illinois.cs.cogcomp.ner.IO.InFile)9 File (java.io.File)3 StringTokenizer (java.util.StringTokenizer)3 HashMap (java.util.HashMap)2 Vector (java.util.Vector)2 ResourceConfigurator (edu.illinois.cs.cogcomp.core.resources.ResourceConfigurator)1 THashMap (gnu.trove.map.hash.THashMap)1 InvalidEndpointException (io.minio.errors.InvalidEndpointException)1 InvalidPortException (io.minio.errors.InvalidPortException)1 FileInputStream (java.io.FileInputStream)1 FileNotFoundException (java.io.FileNotFoundException)1 InputStream (java.io.InputStream)1 Datastore (org.cogcomp.Datastore)1 DatastoreException (org.cogcomp.DatastoreException)1