Search in sources :

Example 6 with ResourceConfigurator

use of edu.illinois.cs.cogcomp.core.resources.ResourceConfigurator in project cogcomp-nlp by CogComp.

The `init` method of the class BrownClusters.

/**
 * Initialize the Brown cluster data. The singleton instance is (re)built while holding
 * {@code INIT_SYNC}, so the population of the instance is serialized with any other code
 * that synchronizes on the same lock.
 *
 * <p>Each cluster file is read line by line; every line is expected to contain three
 * whitespace-separated tokens: the cluster path, the word, and an integer occurrence count.
 * A word is kept only when its count is at least the threshold configured for that file.
 *
 * <p>Datastore and file-not-found failures are printed to standard error and swallowed; the
 * method then returns normally. Malformed lines raise the unchecked exceptions thrown by
 * {@link StringTokenizer#nextToken()} / {@link Integer#parseInt(String)}.
 *
 * @param pathsToClusterFiles the cluster files to load, given relative to the directory
 *        fetched from the datastore.
 * @param thresholds for each cluster file (same index), the minimum occurrence count a word
 *        needs in order to be loaded.
 * @param isLowercaseBrownClusters for each cluster file (same index), the flag stored in
 *        {@code isLowercaseBrownClustersByResource}.
 */
public static void init(Vector<String> pathsToClusterFiles, Vector<Integer> thresholds, Vector<Boolean> isLowercaseBrownClusters) {
    try {
        Datastore dsNoCredentials = new Datastore(new ResourceConfigurator().getDefaultConfig());
        File gazDirectory = dsNoCredentials.getDirectory("org.cogcomp.brown-clusters", "brown-clusters", 1.5, false);
        synchronized (INIT_SYNC) {
            brownclusters = new BrownClusters();
            brownclusters.isLowercaseBrownClustersByResource = new boolean[isLowercaseBrownClusters.size()];
            brownclusters.wordToPathByResource = new ArrayList<>();
            brownclusters.resources = new ArrayList<>();
            for (int i = 0; i < pathsToClusterFiles.size(); i++) {
                THashMap<String, String> h = new THashMap<>();
                // We used to access the files as resources. Now we are accessing them programmatically.
                InputStream is = new FileInputStream(gazDirectory.getPath() + File.separator + pathsToClusterFiles.elementAt(i));
                InFile in = new InFile(is);
                int wordsAdded = 0;
                try {
                    String line = in.readLine();
                    while (line != null) {
                        StringTokenizer st = new StringTokenizer(line);
                        String path = st.nextToken();
                        String word = st.nextToken();
                        int occ = Integer.parseInt(st.nextToken());
                        if (occ >= thresholds.elementAt(i)) {
                            h.put(word, path);
                            wordsAdded++;
                        }
                        line = in.readLine();
                    }
                } finally {
                    // Always release the file handle, even when a line fails to parse.
                    in.close();
                }
                if (ParametersForLbjCode.currentParameters.debug) {
                    logger.info(wordsAdded + " words added");
                }
                brownclusters.wordToPathByResource.add(h);
                brownclusters.isLowercaseBrownClustersByResource[i] = isLowercaseBrownClusters.elementAt(i);
                brownclusters.resources.add(pathsToClusterFiles.elementAt(i));
            }
        }
    } catch (InvalidPortException | InvalidEndpointException | DatastoreException | FileNotFoundException e) {
        // Existing contract: report the failure and return without propagating it.
        e.printStackTrace();
    }
}
Also used : InFile(edu.illinois.cs.cogcomp.ner.IO.InFile) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) FileNotFoundException(java.io.FileNotFoundException) ResourceConfigurator(edu.illinois.cs.cogcomp.core.resources.ResourceConfigurator) DatastoreException(org.cogcomp.DatastoreException) InvalidPortException(io.minio.errors.InvalidPortException) InvalidEndpointException(io.minio.errors.InvalidEndpointException) FileInputStream(java.io.FileInputStream) StringTokenizer(java.util.StringTokenizer) Datastore(org.cogcomp.Datastore) THashMap(gnu.trove.map.hash.THashMap) File(java.io.File) InFile(edu.illinois.cs.cogcomp.ner.IO.InFile)

Example 7 with ResourceConfigurator

use of edu.illinois.cs.cogcomp.core.resources.ResourceConfigurator in project cogcomp-nlp by CogComp.

The `init` method of the class TreeGazetteers.

/**
 * Initialize all the gazetteers, mangling each term in a variety of ways.
 *
 * <p>The gazetteer directory is fetched from the datastore, the list of dictionary files is
 * read from {@code gazetteers/gazetteers-list.txt} inside it, and every listed file is read
 * into two trees: a case-sensitive one and a case-insensitive one (tagged {@code "(IC)"}).
 * Each of {@code dictionaries} and {@code dictionariesIgnoreCase} ends up holding that single
 * combined tree.
 *
 * <p>Datastore failures are printed to standard error and swallowed; the method then returns
 * normally.
 *
 * @param phrase_length the max length of the phrases we will consider.
 * @param pathToDictionaries not used by this method body; the location is resolved through
 *        the datastore instead.
 * @throws IOException if reading the gazetteer list fails (the dictionary loading calls may
 *         presumably raise it as well).
 */
private void init(int phrase_length, String pathToDictionaries) throws IOException {
    try {
        ArrayList<String> filenames = new ArrayList<>();
        Datastore dsNoCredentials = new Datastore(new ResourceConfigurator().getDefaultConfig());
        File gazDirectory = dsNoCredentials.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.5, false);
        // We are not loading the resources from classpath anymore. Instead we are calling them programmatically
        String listPath = gazDirectory.getPath() + File.separator + "gazetteers" + File.separator + "gazetteers-list.txt";
        // try-with-resources so the list file handle is released on both success and failure.
        try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(listPath)))) {
            String line;
            while ((line = br.readLine()) != null) {
                filenames.add(line);
            }
        }
        // init the dictionaries.
        dictionaries = new ArrayList<>(filenames.size());
        dictionariesIgnoreCase = new ArrayList<>(filenames.size());
        GazetteerTree gaz = new GazetteerTree(phrase_length);
        GazetteerTree gazIC = new GazetteerTree(phrase_length, new StringSplitterInterface() {

            @Override
            public String[] split(String line) {
                String tmp = line.toLowerCase();
                // A few short, highly ambiguous words are dropped from the case-insensitive tree.
                if (tmp.equals("in") || tmp.equals("on") || tmp.equals("us") || tmp.equals("or") || tmp.equals("am")) {
                    return new String[0];
                }
                // character tokenization for Chinese
                if (ParametersForLbjCode.currentParameters.language == Language.Chinese) {
                    String[] chars = new String[line.length()];
                    for (int i = 0; i < line.length(); i++) {
                        chars[i] = String.valueOf(line.charAt(i));
                    }
                    return chars;
                }
                return normalize(line).split("[\\s]+");
            }

            @Override
            public String normalize(String term) {
                return term.toLowerCase();
            }
        });
        // for each dictionary, compile each of the gaz trees for each phrase permutation.
        for (String file : filenames) {
            String fileName = gazDirectory.getAbsolutePath() + File.separator + file;
            gaz.readDictionary(file, "", ResourceUtilities.loadResource(fileName));
            gazIC.readDictionary(file, "(IC)", ResourceUtilities.loadResource(fileName));
        }
        gaz.trimToSize();
        gazIC.trimToSize();
        dictionaries.add(gaz);
        dictionariesIgnoreCase.add(gazIC);
        if (ParametersForLbjCode.currentParameters.debug) {
            logger.info("found " + dictionaries.size() + " gazetteers");
        }
    } catch (InvalidPortException | InvalidEndpointException | DatastoreException e) {
        // Existing contract: report the failure and return without propagating it.
        e.printStackTrace();
    }
}
Also used : ArrayList(java.util.ArrayList) ResourceConfigurator(edu.illinois.cs.cogcomp.core.resources.ResourceConfigurator) DatastoreException(org.cogcomp.DatastoreException) InvalidPortException(io.minio.errors.InvalidPortException) InvalidEndpointException(io.minio.errors.InvalidEndpointException) Datastore(org.cogcomp.Datastore) StringSplitterInterface(edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.GazetteerTree.StringSplitterInterface)

Aggregations

ResourceConfigurator (edu.illinois.cs.cogcomp.core.resources.ResourceConfigurator)7 Datastore (org.cogcomp.Datastore)7 File (java.io.File)4 DatastoreException (org.cogcomp.DatastoreException)4 InvalidEndpointException (io.minio.errors.InvalidEndpointException)3 InvalidPortException (io.minio.errors.InvalidPortException)3 FileNotFoundException (java.io.FileNotFoundException)2 AnnotatorException (edu.illinois.cs.cogcomp.annotation.AnnotatorException)1 LocalCommaClassifier (edu.illinois.cs.cogcomp.comma.lbj.LocalCommaClassifier)1 LevinVerbClassFeature (edu.illinois.cs.cogcomp.edison.features.factory.LevinVerbClassFeature)1 StringSplitterInterface (edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.GazetteerTree.StringSplitterInterface)1 InFile (edu.illinois.cs.cogcomp.ner.IO.InFile)1 THashMap (gnu.trove.map.hash.THashMap)1 FileInputStream (java.io.FileInputStream)1 InputStream (java.io.InputStream)1 ArrayList (java.util.ArrayList)1 StringTokenizer (java.util.StringTokenizer)1 GZIPInputStream (java.util.zip.GZIPInputStream)1