Use of com.yahoo.language.detect.Detection in project vespa by vespa-engine.
From the class SimpleDetectorTestCase, method testEncodingGuess.
/**
 * Checks the encoding reported by {@link SimpleDetector#detect} for five byte sequences:
 * high bytes that are not valid UTF-8, a multi-byte UTF-8 sentence, plain ASCII,
 * a single byte that cannot occur in UTF-8, and a truncated UTF-8 sequence.
 */
@Test
public void testEncodingGuess() {
    final Charset latin1 = Charset.forName("ISO-8859-1");

    // Case 1: arbitrary bytes above 127 that do not form valid UTF-8.
    byte[] highBytes = { (byte) 196, (byte) 197, (byte) 198 };
    Detection highBytesResult = new SimpleDetector().detect(highBytes, 0, highBytes.length, null);
    assertEquals(latin1, highBytesResult.getEncoding());

    // Case 2: a sentence taken from http://www.columbia.edu/kermit/utf8.html meaning
    // "I can eat glass (and it doesn't hurt me)", encoded through Utf8.toBytes.
    byte[] utf8Sentence = Utf8.toBytes(
            "\ub098\ub294 \uc720\ub9ac\ub97c \uba39\uc744 \uc218 \uc788\uc5b4\uc694. "
            + "\uadf8\ub798\ub3c4 \uc544\ud504\uc9c0 \uc54a\uc544\uc694");
    Detection utf8Result = new SimpleDetector().detect(utf8Sentence, 0, utf8Sentence.length, null);
    assertEquals(Utf8.getCharset(), utf8Result.getEncoding());

    // Case 3: arbitrary bytes within the ASCII range.
    byte[] asciiBytes = { 31, 32, 33 };
    Detection asciiResult = new SimpleDetector().detect(asciiBytes, 0, asciiBytes.length, null);
    assertEquals(Charset.forName("US-ASCII"), asciiResult.getEncoding());

    // Case 4: a lone byte value that is never valid in UTF-8.
    byte[] invalidByte = { -1 };
    Detection invalidResult = new SimpleDetector().detect(invalidByte, 0, invalidByte.length, null);
    assertEquals(latin1, invalidResult.getEncoding());

    // Case 5: only the first byte of an encoded character, so the sequence
    // needs more bytes than the input provides.
    byte[] truncated = { Utf8.toBytes("\u00E5")[0] };
    Detection truncatedResult = new SimpleDetector().detect(truncated, 0, truncated.length, null);
    assertEquals(latin1, truncatedResult.getEncoding());
}
Use of com.yahoo.language.detect.Detection in project vespa by vespa-engine.
From the class ExecutionContext, method resolveLanguage.
/**
 * Resolves the language to use for this context.
 *
 * <p>If the {@code language} field is set to something other than
 * {@link Language#UNKNOWN}, it is returned as is. Otherwise the detector obtained
 * from {@code linguistics} is run over {@code String.valueOf(value)}.
 * {@link Language#ENGLISH} is the fallback whenever {@code linguistics} is
 * {@code null}, the detector returns no detection, or the detected language is
 * {@link Language#UNKNOWN}.
 *
 * @param linguistics source of the language detector; may be {@code null}
 * @return the explicitly set language, the detected language, or
 *         {@link Language#ENGLISH} as the fallback
 */
public Language resolveLanguage(Linguistics linguistics) {
    boolean explicitlySet = language != null && language != Language.UNKNOWN;
    if (explicitlySet) {
        return language;
    }

    // Without linguistics there is nothing to detect with.
    if (linguistics == null) {
        return Language.ENGLISH;
    }

    Detection result = linguistics.getDetector().detect(String.valueOf(value), null);
    if (result == null) {
        return Language.ENGLISH;
    }

    Language guess = result.getLanguage();
    return guess == Language.UNKNOWN ? Language.ENGLISH : guess;
}