Use of com.ibm.icu.text.Transliterator in the project lucene-solr by apache:
the class TestICUTransformFilter, method testOptimizerSurrogate.
public void testOptimizerSurrogate() throws Exception {
  // Rule matching a supplementary-plane character: CJK UNIFIED IDEOGRAPH-20087 -> 'x'.
  String rules = "\\U00020087 > x;";
  Transliterator custom = Transliterator.createFromRules("test", rules, Transliterator.FORWARD);
  // A freshly created transliterator has no filter installed.
  assertNull(custom.getFilter());
  final KeywordTokenizer input = new KeywordTokenizer();
  input.setReader(new StringReader(""));
  // Constructing the filter optimizes the transliterator as a side effect,
  // installing a UnicodeSet filter limited to the characters the rules can match.
  new ICUTransformFilter(input, custom);
  assertEquals(new UnicodeSet("[\\U00020087]"), custom.getFilter());
}
Use of com.ibm.icu.text.Transliterator in the project cogcomp-nlp by CogComp:
the class Utils, method readWikiData.
/**
 * Reads data in the format created by the wikipedia-api project, commonly named wikidata.Language.
 * Each data line is tab-separated, foreign name first, English name second. Lines containing
 * '#', lines with fewer than two fields, pairs whose token counts differ, and pairs matching a
 * few noise heuristics (e.g. "jr.", "pope ", parenthesized text) are skipped.
 *
 * @param file name of file
 * @param fix whether or not the names should be reordered according to edit distance
 * @return de-duplicated list of token-level examples
 * @throws FileNotFoundException if the file cannot be opened
 */
public static List<Example> readWikiData(String file, boolean fix) throws FileNotFoundException {
    List<String> lines = LineIO.read(file);
    // Romanize, decompose, and strip everything non-alphanumeric so the foreign
    // token can be compared against the English token by edit distance.
    String id = "Any-Latin; NFD; [^\\p{Alnum}] Remove";
    Transliterator t = Transliterator.getInstance(id);
    // De-duplication relies on Example implementing equals/hashCode — TODO confirm.
    HashSet<Example> unique = new HashSet<>();
    int skipping = 0;
    for (String line : lines) {
        if (line.contains("#")) {
            continue;
        }
        String[] parts = line.split("\t");
        if (parts.length < 2) {
            continue;
        }
        // In wikipedia data, the foreign name comes first, English second.
        String foreign = parts[0].toLowerCase();
        String english = parts[1].toLowerCase();
        String[] ftoks = foreign.split(" ");
        String[] etoks = english.split(" ");
        if (ftoks.length != etoks.length) {
            logger.error("Mismatching length of tokens: " + english);
            skipping++;
            continue;
        }
        // Other heuristics to help clean the data.
        if (english.contains("jr.") || english.contains("sr.") || english.contains(" of ")
                || english.contains(" de ") || english.contains("(") || english.contains("pope ")) {
            skipping++;
            continue;
        }
        int numtoks = ftoks.length;
        for (int i = 0; i < numtoks; i++) {
            String ftrans = t.transform(ftoks[i]);
            int mindist = Integer.MAX_VALUE;
            String bestmatch = null;
            // Greedy alignment: pick the English token closest in edit distance to the
            // romanized foreign token. NOTE(review): matched tokens are not removed from
            // the candidate pool, so two foreign tokens can align to the same English token.
            for (int j = 0; j < numtoks; j++) {
                int d = LevensteinDistance.getLevensteinDistance(ftrans, etoks[j]);
                if (d < mindist) {
                    bestmatch = etoks[j];
                    mindist = d;
                }
            }
            // Strip those pesky trailing commas.
            if (ftoks[i].endsWith(",")) {
                ftoks[i] = ftoks[i].substring(0, ftoks[i].length() - 1);
            }
            Example addme;
            if (fix) {
                // This uses the best-aligned English token (recommended).
                addme = new Example(bestmatch, ftoks[i]);
            } else {
                // This assumes the file ordering is correct.
                addme = new Example(etoks[i], ftoks[i]);
            }
            // Dead 'examples' accumulator removed: it was populated in lockstep with
            // 'unique' but never read — the method returns the de-duplicated set.
            unique.add(addme);
        }
    }
    logger.debug(String.format("Skipped %d lines", skipping));
    return new ArrayList<>(unique);
}
Use of com.ibm.icu.text.Transliterator in the project lucene-solr by apache:
the class TestICUTransformFilter, method testOptimizer.
public void testOptimizer() throws Exception {
  // Chained rules: a's become b's and b's become c's.
  String rules = "a > b; b > c;";
  Transliterator custom = Transliterator.createFromRules("test", rules, Transliterator.FORWARD);
  // A freshly created transliterator has no filter installed.
  assertNull(custom.getFilter());
  final KeywordTokenizer input = new KeywordTokenizer();
  input.setReader(new StringReader(""));
  // Constructing the filter optimizes the transliterator as a side effect,
  // installing a UnicodeSet filter limited to the rules' source characters.
  new ICUTransformFilter(input, custom);
  assertEquals(new UnicodeSet("[ab]"), custom.getFilter());
}
Use of com.ibm.icu.text.Transliterator in the project lucene-solr by apache:
the class TestICUTransformFilter, method testRandomStrings.
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
  final Transliterator transform = Transliterator.getInstance("Any-Latin");
  // try-with-resources so the Analyzer is closed even when checkRandomData throws;
  // the original called close() unconditionally and leaked on failure.
  try (Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(tokenizer, new ICUTransformFilter(tokenizer, transform));
    }
  }) {
    checkRandomData(random(), a, 1000 * RANDOM_MULTIPLIER);
  }
}
Use of com.ibm.icu.text.Transliterator in the project cogcomp-nlp by CogComp:
the class Utils, method romanization.
/**
 * Experimental romanization driver over a fixed Russian wikidata file: transliterates each
 * foreign token, greedily aligns it to the closest English token by edit distance, and prints
 * progress plus the total number of examples produced.
 *
 * @throws FileNotFoundException if the data file cannot be opened
 */
public static void romanization() throws FileNotFoundException {
    List<String> lines = LineIO.read("/shared/corpora/transliteration/wikidata/wikidata.Russian.fixed");
    // NOTE(review): target script is Arabic although the data file is Russian —
    // presumably deliberate experimentation; confirm before reusing this id.
    String id = "Any-Arabic; NFD; [^\\p{Alnum}] Remove";
    Transliterator t = Transliterator.getInstance(id);
    int jj = 0; // count of candidate data lines, for progress reporting
    List<Example> examples = new ArrayList<>();
    for (String line : lines) {
        if (line.contains("#")) {
            continue;
        }
        jj++;
        String[] parts = line.split("\t");
        if (parts.length < 2) {
            continue;
        }
        // In wikipedia data, the foreign name comes first, English second.
        String foreign = parts[0].toLowerCase();
        String english = parts[1].toLowerCase();
        String[] ftoks = foreign.split(" ");
        String[] etoks = english.split(" ");
        if (ftoks.length != etoks.length) {
            logger.error("Mismatching length of tokens: " + english);
            continue;
        }
        int numtoks = ftoks.length;
        for (int i = 0; i < numtoks; i++) {
            String ftrans = t.transform(ftoks[i]);
            ftoks[i] = ftrans;
            int mindist = Integer.MAX_VALUE;
            String bestmatch = null;
            // Greedy alignment: pick the English token closest in edit distance to the
            // transliterated foreign token; matched tokens stay in the candidate pool.
            for (int j = 0; j < numtoks; j++) {
                int d = LevensteinDistance.getLevensteinDistance(ftrans, etoks[j]);
                if (d < mindist) {
                    bestmatch = etoks[j];
                    mindist = d;
                }
            }
            examples.add(new Example(bestmatch, ftrans));
        }
        // Progress indicator every 1000 lines.
        if (jj % 1000 == 0) {
            System.out.println(jj);
        }
    }
    System.out.println(examples.size());
    // Dead code removed: a trailing loop enumerated t.getAvailableIDs() with a
    // fully commented-out body — no output and no side effects.
}
Aggregations