Example 1 with Transliterator

Use of com.ibm.icu.text.Transliterator in the lucene-solr project by apache.

From class TestICUTransformFilter, method testOptimizerSurrogate.

public void testOptimizerSurrogate() throws Exception {
    // convert CJK UNIFIED IDEOGRAPH-20087 to an x
    String rules = "\\U00020087 > x;";
    Transliterator custom = Transliterator.createFromRules("test", rules, Transliterator.FORWARD);
    assertTrue(custom.getFilter() == null);
    final KeywordTokenizer input = new KeywordTokenizer();
    input.setReader(new StringReader(""));
    new ICUTransformFilter(input, custom);
    assertTrue(custom.getFilter().equals(new UnicodeSet("[\\U00020087]")));
}
Also used: StringReader (java.io.StringReader), KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer), UnicodeSet (com.ibm.icu.text.UnicodeSet), Transliterator (com.ibm.icu.text.Transliterator)
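
As a self-contained sketch (outside the Lucene test harness; the class name and sample string are ours, not from the source), the same rule can be exercised directly against ICU:

import com.ibm.icu.text.Transliterator;

public class SurrogateRuleDemo {
    public static void main(String[] args) {
        // Same rule as the test: U+20087 is a supplementary-plane CJK
        // ideograph, so it occupies a surrogate pair in Java strings.
        String rules = "\\U00020087 > x;";
        Transliterator custom =
                Transliterator.createFromRules("test", rules, Transliterator.FORWARD);
        // "\uD840\uDC87" is the UTF-16 encoding of U+20087.
        System.out.println(custom.transliterate("a\uD840\uDC87b")); // expected: "axb"
    }
}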

Example 2 with Transliterator

Use of com.ibm.icu.text.Transliterator in the cogcomp-nlp project by CogComp.

From class Utils, method readWikiData.

/**
 * Reads data in the format created by the wikipedia-api project, commonly named wikidata.Language.
 * @param file the name of the file to read
 * @param fix whether the names should be reordered according to edit distance
 * @return a list of examples
 * @throws FileNotFoundException if the file cannot be read
 */
public static List<Example> readWikiData(String file, boolean fix) throws FileNotFoundException {
    List<Example> examples = new ArrayList<>();
    List<String> lines = LineIO.read(file);
    String id = "Any-Latin; NFD; [^\\p{Alnum}] Remove";
    // id = "Any-Latin; NFD";
    Transliterator t = Transliterator.getInstance(id);
    HashSet<Example> unique = new HashSet<>();
    int skipping = 0;
    for (String line : lines) {
        // skip comment lines
        if (line.contains("#")) {
            continue;
        }
        String[] parts = line.split("\t");
        // require at least a foreign/English pair
        if (parts.length < 2) {
            continue;
        }
        // In wikipedia data, the foreign name comes first, English second.
        String foreign = parts[0].toLowerCase();
        String english = parts[1].toLowerCase();
        String[] ftoks = foreign.split(" ");
        String[] etoks = english.split(" ");
        if (ftoks.length != etoks.length) {
            logger.error("Mismatching length of tokens: " + english);
            skipping++;
            continue;
        }
        // other heuristics to help clean data
        if (english.contains("jr.") || english.contains("sr.") || english.contains(" of ") || english.contains(" de ") || english.contains("(") || english.contains("pope ")) {
            skipping++;
            // logger.debug("Skipping: " + english);
            continue;
        }
        int numtoks = ftoks.length;
        for (int i = 0; i < numtoks; i++) {
            String ftrans = t.transform(ftoks[i]);
            int mindist = Integer.MAX_VALUE;
            String bestmatch = null;
            // this is intended to help with ordering.
            for (int j = 0; j < numtoks; j++) {
                int d = LevensteinDistance.getLevensteinDistance(ftrans, etoks[j]);
                if (d < mindist) {
                    // match etoks[j] with ftrans
                    bestmatch = etoks[j];
                    mindist = d;
                // then take etoks[j] out of the running
                }
            }
            // strip those pesky commas.
            if (ftoks[i].endsWith(",")) {
                ftoks[i] = ftoks[i].substring(0, ftoks[i].length() - 1);
            }
            // This version uses transliterated words as the target (cheating)
            // examples.add(new Example(bestmatch, ftrans));
            Example addme;
            if (fix) {
                // This uses the best aligned version (recommended)
                addme = new Example(bestmatch, ftoks[i]);
            } else {
                // This assumes the file ordering is correct
                addme = new Example(etoks[i], ftoks[i]);
            }
            examples.add(addme);
            unique.add(addme);
        }
    }
    // System.out.println(file.split("\\.")[1] + " & " + numnames + " & " + examples.size() + " & " + unique.size() + " \\\\");
    logger.debug(String.format("Skipped %d lines", skipping));
    return new ArrayList<>(unique);
}
Also used: MultiExample (edu.illinois.cs.cogcomp.transliteration.MultiExample), Example (edu.illinois.cs.cogcomp.transliteration.Example), Transliterator (com.ibm.icu.text.Transliterator)
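
For context, a minimal sketch of what the compound ID above does, outside the cogcomp-nlp pipeline; the demo class and sample input are illustrative, and the exact romanization may vary by ICU version:

import com.ibm.icu.text.Transliterator;

public class CompoundIdDemo {
    public static void main(String[] args) {
        // The compound ID chains three transforms: romanize any script,
        // decompose to NFD, then strip everything that is not alphanumeric
        // (which removes the combining accents NFD just split off).
        Transliterator t =
                Transliterator.getInstance("Any-Latin; NFD; [^\\p{Alnum}] Remove");
        System.out.println(t.transform("Пушкин")); // roughly "Puskin": accents stripped
    }
}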

Example 3 with Transliterator

Use of com.ibm.icu.text.Transliterator in the lucene-solr project by apache.

From class TestICUTransformFilter, method testOptimizer.

public void testOptimizer() throws Exception {
    // convert a's to b's and b's to c's
    String rules = "a > b; b > c;";
    Transliterator custom = Transliterator.createFromRules("test", rules, Transliterator.FORWARD);
    assertTrue(custom.getFilter() == null);
    final KeywordTokenizer input = new KeywordTokenizer();
    input.setReader(new StringReader(""));
    new ICUTransformFilter(input, custom);
    assertTrue(custom.getFilter().equals(new UnicodeSet("[ab]")));
}
Also used: StringReader (java.io.StringReader), KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer), UnicodeSet (com.ibm.icu.text.UnicodeSet), Transliterator (com.ibm.icu.text.Transliterator)
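
A sketch of how the two rules behave on plain input (the input string and class name are ours): ICU advances the cursor past each replacement, so the output of a > b is not re-fed into b > c.

import com.ibm.icu.text.Transliterator;

public class ChainedRulesDemo {
    public static void main(String[] args) {
        String rules = "a > b; b > c;";
        Transliterator custom =
                Transliterator.createFromRules("test", rules, Transliterator.FORWARD);
        // The cursor advances past each replacement, so the 'b' produced
        // from 'a' is not re-matched by the second rule.
        System.out.println(custom.transliterate("ab")); // expected: "bc", not "cc"
    }
}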

Example 4 with Transliterator

Use of com.ibm.icu.text.Transliterator in the lucene-solr project by apache.

From class TestICUTransformFilter, method testRandomStrings.

/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
    final Transliterator transform = Transliterator.getInstance("Any-Latin");
    Analyzer a = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(tokenizer, new ICUTransformFilter(tokenizer, transform));
        }
    };
    checkRandomData(random(), a, 1000 * RANDOM_MULTIPLIER);
    a.close();
}
Also used: MockTokenizer (org.apache.lucene.analysis.MockTokenizer), Analyzer (org.apache.lucene.analysis.Analyzer), Tokenizer (org.apache.lucene.analysis.Tokenizer), KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer), Transliterator (com.ibm.icu.text.Transliterator)
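
Outside the random-strings harness, a minimal sketch of the same analysis chain applied to concrete text; it assumes the Lucene analyzers-icu and analyzers-common modules are on the classpath, and the printed forms are indicative only:

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.icu.ICUTransformFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import com.ibm.icu.text.Transliterator;

public class AnyLatinAnalyzerDemo {
    public static void main(String[] args) throws IOException {
        final Transliterator transform = Transliterator.getInstance("Any-Latin");
        Analyzer a = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = new WhitespaceTokenizer();
                return new TokenStreamComponents(tokenizer,
                        new ICUTransformFilter(tokenizer, transform));
            }
        };
        try (TokenStream ts = a.tokenStream("field", "Москва Αθήνα")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(term.toString()); // romanized tokens, e.g. "Moskva"
            }
            ts.end();
        }
        a.close();
    }
}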

Example 5 with Transliterator

Use of com.ibm.icu.text.Transliterator in the cogcomp-nlp project by CogComp.

From class Utils, method romanization.

public static void romanization() throws FileNotFoundException {
    List<String> lines = LineIO.read("/shared/corpora/transliteration/wikidata/wikidata.Russian.fixed");
    String id = "Any-Arabic; NFD; [^\\p{Alnum}] Remove";
    // id = "Any-Latin; NFD";
    Transliterator t = Transliterator.getInstance(id);
    // count data lines for periodic progress output
    int jj = 0;
    List<Example> examples = new ArrayList<>();
    for (String line : lines) {
        if (line.contains("#")) {
            continue;
        }
        jj++;
        String[] parts = line.split("\t");
        if (parts.length < 2) {
            continue;
        }
        // In wikipedia data, the foreign name comes first, English second.
        String foreign = parts[0].toLowerCase();
        String english = parts[1].toLowerCase();
        String[] ftoks = foreign.split(" ");
        String[] etoks = english.split(" ");
        if (ftoks.length != etoks.length) {
            logger.error("Mismatching length of tokens: " + english);
            continue;
        }
        int numtoks = ftoks.length;
        for (int i = 0; i < numtoks; i++) {
            String ftrans = t.transform(ftoks[i]);
            ftoks[i] = ftrans;
            int mindist = Integer.MAX_VALUE;
            String bestmatch = null;
            for (int j = 0; j < numtoks; j++) {
                int d = LevensteinDistance.getLevensteinDistance(ftrans, etoks[j]);
                if (d < mindist) {
                    // match etoks[j] with ftrans
                    bestmatch = etoks[j];
                    mindist = d;
                // then take etoks[j] out of the running
                }
            }
            // System.out.print(ftrans + " : " + bestmatch + ", ");
            examples.add(new Example(bestmatch, ftrans));
        }
        if (jj % 1000 == 0) {
            System.out.println(jj);
        }
    }
    System.out.println(examples.size());
    // getAvailableIDs() is static; access it via the class rather than the instance.
    Enumeration<String> tids = Transliterator.getAvailableIDs();
    while (tids.hasMoreElements()) {
        String e = tids.nextElement();
        // System.out.println(e);
    }
}
Also used: MultiExample (edu.illinois.cs.cogcomp.transliteration.MultiExample), Example (edu.illinois.cs.cogcomp.transliteration.Example), Transliterator (com.ibm.icu.text.Transliterator)
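
Since the final loop above never prints anything, here is a standalone sketch (the class name is ours) that actually lists the registered transliterator IDs via the static accessor:

import java.util.Enumeration;

import com.ibm.icu.text.Transliterator;

public class ListTransliteratorIds {
    public static void main(String[] args) {
        // getAvailableIDs() is static, so no instance is needed.
        Enumeration<String> ids = Transliterator.getAvailableIDs();
        while (ids.hasMoreElements()) {
            System.out.println(ids.nextElement());
        }
    }
}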

Aggregations

Transliterator (com.ibm.icu.text.Transliterator): 5 usages
KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer): 3 usages
UnicodeSet (com.ibm.icu.text.UnicodeSet): 2 usages
Example (edu.illinois.cs.cogcomp.transliteration.Example): 2 usages
MultiExample (edu.illinois.cs.cogcomp.transliteration.MultiExample): 2 usages
StringReader (java.io.StringReader): 2 usages
Analyzer (org.apache.lucene.analysis.Analyzer): 1 usage
MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 1 usage
Tokenizer (org.apache.lucene.analysis.Tokenizer): 1 usage