Examples with StringSplitterInterface - edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.GazetteerTree.StringSplitterInterface

Example 1 with StringSplitterInterface

Use of edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.GazetteerTree.StringSplitterInterface in project cogcomp-nlp by CogComp.

From the class TreeGazetteers, method init.

/**
 * Initializes the gazetteers: reads the list of gazetteer file names and compiles every
 * listed dictionary into two trees, one built with the default splitter and one whose
 * splitter lower-cases its terms (read with the "(IC)" suffix argument).
 *
 * @param phrase_length the max length of the phrases we will consider.
 * @param pathToDictionaries not read by this version; the gazetteer directory is obtained
 *        from the {@code Datastore} instead (see the commented-out classpath lookup below).
 * @throws IOException if reading the gazetteer list fails; dictionary loading may raise it too.
 */
private void init(int phrase_length, String pathToDictionaries) throws IOException {
    try {
        ArrayList<String> filenames = new ArrayList<>();
        Datastore dsNoCredentials = new Datastore(new ResourceConfigurator().getDefaultConfig());
        File gazDirectory = dsNoCredentials.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.5, false);
        // We are not loading the resources from classpath anymore. Instead we are calling them programmatically
        // InputStream stream = ResourceUtilities.loadResource(pathToDictionaries + "/gazetteers-list.txt");
        String pathToLists = gazDirectory.getPath() + File.separator + "gazetteers" + File.separator + "gazetteers-list.txt";
        // try-with-resources so the list file is closed even when readLine() throws.
        try (InputStream stream = new FileInputStream(pathToLists);
                BufferedReader br = new BufferedReader(new InputStreamReader(stream))) {
            String line;
            while ((line = br.readLine()) != null) {
                filenames.add(line);
            }
        }
        // init the dictionaries.
        dictionaries = new ArrayList<>(filenames.size());
        dictionariesIgnoreCase = new ArrayList<>(filenames.size());
        GazetteerTree gaz = new GazetteerTree(phrase_length);
        GazetteerTree gazIC = new GazetteerTree(phrase_length, new StringSplitterInterface() {

            @Override
            public String[] split(String line) {
                String tmp = line.toLowerCase();
                // these short words are dropped entirely when matching ignores case.
                if (tmp.equals("in") || tmp.equals("on") || tmp.equals("us") || tmp.equals("or") || tmp.equals("am")) {
                    return new String[0];
                }
                // character tokenization for Chinese
                if (ParametersForLbjCode.currentParameters.language == Language.Chinese) {
                    String[] chars = new String[line.length()];
                    for (int i = 0; i < line.length(); i++) {
                        chars[i] = String.valueOf(line.charAt(i));
                    }
                    return chars;
                }
                // otherwise lower-case the line and split it on runs of whitespace.
                return normalize(line).split("[\\s]+");
            }

            @Override
            public String normalize(String term) {
                return term.toLowerCase();
            }
        });
        // for each dictionary, compile each of the gaz trees for each phrase permutation.
        for (String file : filenames) {
            String fileName = gazDirectory.getAbsolutePath() + File.separator + file;
            gaz.readDictionary(file, "", ResourceUtilities.loadResource(fileName));
            gazIC.readDictionary(file, "(IC)", ResourceUtilities.loadResource(fileName));
        }
        gaz.trimToSize();
        gazIC.trimToSize();
        dictionaries.add(gaz);
        dictionariesIgnoreCase.add(gazIC);
        if (ParametersForLbjCode.currentParameters.debug) {
            logger.info("found " + dictionaries.size() + " gazetteers");
        }
    } catch (InvalidPortException | InvalidEndpointException e) {
        // NOTE(review): the failure is only printed; if it was raised by the Datastore calls
        // above, this method returns without assigning the dictionary fields.
        e.printStackTrace();
    } catch (DatastoreException e) {
        // NOTE(review): same as above - printed and otherwise ignored.
        e.printStackTrace();
    }
}

Also used: ArrayList (java.util.ArrayList), ResourceConfigurator (edu.illinois.cs.cogcomp.core.resources.ResourceConfigurator), DatastoreException (org.cogcomp.DatastoreException), InvalidPortException (io.minio.errors.InvalidPortException), InvalidEndpointException (io.minio.errors.InvalidEndpointException), Datastore (org.cogcomp.Datastore), StringSplitterInterface (edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.GazetteerTree.StringSplitterInterface)

Example 2 with StringSplitterInterface

Use of edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.GazetteerTree.StringSplitterInterface in project cogcomp-nlp by CogComp.

From the class TreeGazetteers, method init.

/**
 * Initializes the gazetteers. The gazetteer list is first looked up under
 * {@code pathToDictionaries} via {@code ResourceUtilities.loadResource}; when that returns
 * {@code null} the gazetteer directory is fetched through the {@code Datastore} instead.
 * Every listed dictionary is then compiled into two trees: one that keeps terms as they are
 * and one whose splitter lower-cases them (read with the "(IC)" suffix argument).
 *
 * @param phrase_length the max length of the phrases we will consider.
 * @param pathToDictionaries the path to the gazetteers.
 * @param language when this is {@code Language.Chinese}, lines are split into single
 *        characters rather than on whitespace.
 * @throws IOException if reading the gazetteer list fails; dictionary loading may raise it too.
 */
private void init(int phrase_length, String pathToDictionaries, final Language language) throws IOException {
    try {
        // check the local file system for it.
        File gazDirectory = new File(pathToDictionaries);
        String pathToLists = gazDirectory.getPath() + File.separator + "gazetteers" + File.separator + "gazetteers-list.txt";
        InputStream stream = ResourceUtilities.loadResource(pathToLists);
        if (stream == null) {
            logger.info("Loading gazetteers from \"" + pathToLists + "\" using the Minio cache.");
            // not in file system or classpath, try Minio.
            Datastore dsNoCredentials = new Datastore(new ResourceConfigurator().getDefaultConfig());
            gazDirectory = dsNoCredentials.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.6, false);
            stream = new FileInputStream(gazDirectory.getPath() + File.separator + "gazetteers" + File.separator + "gazetteers-list.txt");
        } else {
            logger.info("Loading gazetteers from \"" + pathToLists + "\" from the local file system.");
        }
        ArrayList<String> filenames = new ArrayList<>();
        // try-with-resources: closing the reader also closes the underlying list stream,
        // even when readLine() throws.
        try (BufferedReader br = new BufferedReader(new InputStreamReader(stream))) {
            String line;
            while ((line = br.readLine()) != null) {
                filenames.add(line);
            }
        }
        // init the dictionaries.
        dictionaries = new ArrayList<>(filenames.size());
        dictionariesIgnoreCase = new ArrayList<>(filenames.size());
        GazetteerTree gaz = new GazetteerTree(phrase_length, new StringSplitterInterface() {

            @Override
            public String[] split(String line) {
                // character tokenization for Chinese
                if (language == Language.Chinese) {
                    String[] chars = new String[line.length()];
                    for (int i = 0; i < line.length(); i++) {
                        chars[i] = String.valueOf(line.charAt(i));
                    }
                    return chars;
                }
                // otherwise split on runs of whitespace, keeping the original case.
                return line.split("[\\s]+");
            }

            @Override
            public final String normalize(String term) {
                return term;
            }
        });
        GazetteerTree gazIC = new GazetteerTree(phrase_length, new StringSplitterInterface() {

            @Override
            public String[] split(String line) {
                String tmp = line.toLowerCase();
                // these short words are dropped entirely when matching ignores case.
                if (tmp.equals("in") || tmp.equals("on") || tmp.equals("us") || tmp.equals("or") || tmp.equals("am")) {
                    return new String[0];
                }
                // character tokenization for Chinese
                if (language == Language.Chinese) {
                    String[] chars = new String[line.length()];
                    for (int i = 0; i < line.length(); i++) {
                        chars[i] = String.valueOf(line.charAt(i));
                    }
                    return chars;
                }
                // otherwise lower-case the line and split it on runs of whitespace.
                return normalize(line).split("[\\s]+");
            }

            @Override
            public String normalize(String term) {
                return term.toLowerCase();
            }
        });
        // for each dictionary, compile each of the gaz trees for each phrase permutation.
        for (String file : filenames) {
            String fileName = gazDirectory.getAbsolutePath() + File.separator + file;
            gaz.readDictionary(file, "", ResourceUtilities.loadResource(fileName));
            gazIC.readDictionary(file, "(IC)", ResourceUtilities.loadResource(fileName));
        }
        gaz.trimToSize();
        gazIC.trimToSize();
        dictionaries.add(gaz);
        dictionariesIgnoreCase.add(gazIC);
        logger.info("Gazetteers from \"" + pathToLists + "\" are loaded.");
    } catch (InvalidPortException | InvalidEndpointException e) {
        // NOTE(review): the failure is only printed; if it was raised by the Datastore calls
        // above, this method returns without assigning the dictionary fields.
        e.printStackTrace();
    } catch (DatastoreException e) {
        // NOTE(review): same as above - printed and otherwise ignored.
        e.printStackTrace();
    }
}

Aggregations

ResourceConfigurator (edu.illinois.cs.cogcomp.core.resources.ResourceConfigurator): 2, StringSplitterInterface (edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.GazetteerTree.StringSplitterInterface): 2, InvalidEndpointException (io.minio.errors.InvalidEndpointException): 2, InvalidPortException (io.minio.errors.InvalidPortException): 2, ArrayList (java.util.ArrayList): 2, Datastore (org.cogcomp.Datastore): 2, DatastoreException (org.cogcomp.DatastoreException): 2