Use of com.ibm.icu.text.RuleBasedBreakIterator in the project lucene-solr by apache.
Class RBBIRuleCompiler, method compile.
/**
 * Compiles every rule file in {@code srcDir} whose name ends with "rbbi" into a
 * binary break-iterator file in {@code destDir}; the trailing "rbbi" of the
 * file name is replaced by "brk".
 *
 * <p>Progress and the resulting file size are printed to {@code System.err}.
 * If a rule file cannot be instantiated as a {@link RuleBasedBreakIterator},
 * the error message is printed and the JVM exits with status 1.
 *
 * @param srcDir  directory that is scanned for rule files
 * @param destDir directory that receives the compiled output files
 * @throws IOException if {@code srcDir} cannot be listed
 * @throws Exception   anything raised while reading or compiling the rules
 */
static void compile(File srcDir, File destDir) throws Exception {
  File[] files = srcDir.listFiles(new FilenameFilter() {
    public boolean accept(File dir, String name) {
      return name.endsWith("rbbi");
    }
  });
  if (files == null) {
    throw new IOException("Path does not exist: " + srcDir);
  }
  for (File file : files) {
    File outputFile = new File(destDir, file.getName().replaceAll("rbbi$", "brk"));
    String rules = getRules(file);
    System.err.print("Compiling " + file.getName() + " to " + outputFile.getName() + ": ");
    /*
     * if there is a syntax error, compileRules() may succeed. the way to
     * check is to try to instantiate from the string. additionally if the
     * rules are invalid, you can get a useful syntax error.
     */
    try {
      new RuleBasedBreakIterator(rules);
    } catch (IllegalArgumentException e) {
      /*
       * do this intentionally, so you don't get a massive stack trace
       * instead, get a useful syntax error!
       */
      System.err.println(e.getMessage());
      System.exit(1);
    }
    // try-with-resources: the output stream is closed even when compileRules() throws.
    try (FileOutputStream os = new FileOutputStream(outputFile)) {
      RuleBasedBreakIterator.compileRules(rules, os);
    }
    // Report the size only after the stream is closed, so everything has been written out.
    System.err.println(outputFile.length() + " bytes.");
  }
}
Use of com.ibm.icu.text.RuleBasedBreakIterator in the project elasticsearch by elastic.
Class IcuTokenizerFactory, method parseRules.
/**
 * Parses a single RBBI rule file.
 *
 * <p>The file name is resolved against {@code env.configFile()}. Lines that
 * start with '#' are dropped; the remaining lines are joined with '\n' and
 * handed to {@link RuleBasedBreakIterator}.
 *
 * @param filename rule file name, resolved against the config directory
 * @param env      environment that supplies the config directory
 * @return a break iterator built from the rule text
 * @throws IOException if the file cannot be read
 */
private BreakIterator parseRules(String filename, Environment env) throws IOException {
  final Path rulesFile = env.configFile().resolve(filename);
  final String ruleText = Files.readAllLines(rulesFile)
      .stream()
      .filter(line -> !line.startsWith("#"))
      .collect(Collectors.joining("\n"));
  return new RuleBasedBreakIterator(ruleText);
}
Use of com.ibm.icu.text.RuleBasedBreakIterator in the project lucene-solr by apache.
Class DefaultICUTokenizerConfig, method readBreakIterator.
/**
 * Loads a precompiled break iterator from a classpath resource.
 *
 * <p>The resource is looked up relative to {@code DefaultICUTokenizerConfig}
 * via {@link Class#getResourceAsStream(String)}.
 *
 * @param filename resource name of the compiled rules
 * @return the break iterator read from the resource
 * @throws RuntimeException if the resource is missing, or wrapping the
 *         {@link IOException} raised while reading it
 */
private static RuleBasedBreakIterator readBreakIterator(String filename) {
  // try-with-resources: the stream is closed even when reading the rules fails.
  try (InputStream is = DefaultICUTokenizerConfig.class.getResourceAsStream(filename)) {
    if (is == null) {
      // getResourceAsStream() returns null for a missing resource; fail with a
      // message that names it instead of passing null on to ICU.
      throw new RuntimeException("Break iterator resource not found: " + filename);
    }
    return RuleBasedBreakIterator.getInstanceFromCompiledRules(is);
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
Use of com.ibm.icu.text.RuleBasedBreakIterator in the project lucene-solr by apache.
Class ICUTokenizerFactory, method parseRules.
/**
 * Reads a rule file through {@code loader}, decodes it as UTF-8 and builds a
 * {@link RuleBasedBreakIterator} from its text.
 *
 * <p>The content of lines starting with '#' is skipped, but a newline is still
 * emitted for every input line.
 *
 * @param filename resource name of the rule file
 * @param loader   resource loader used to open the file
 * @return a break iterator built from the rule text
 * @throws IOException if the resource cannot be opened or read
 */
private BreakIterator parseRules(String filename, ResourceLoader loader) throws IOException {
  StringBuilder rules = new StringBuilder();
  // try-with-resources: stream and reader are closed even when readLine() throws.
  try (InputStream rulesStream = loader.openResource(filename);
       BufferedReader reader =
           new BufferedReader(IOUtils.getDecodingReader(rulesStream, StandardCharsets.UTF_8))) {
    String line;
    while ((line = reader.readLine()) != null) {
      if (!line.startsWith("#")) {
        rules.append(line);
      }
      // Appended for every line, comment or not — presumably so line numbers
      // in rule syntax errors still match the file (TODO confirm).
      rules.append('\n');
    }
  }
  return new RuleBasedBreakIterator(rules.toString());
}
Aggregations