use of org.languagetool.Language in project languagetool by languagetool-org.
the class LanguageDetectionEval method getShortestCorrectDetection.
/**
 * Determines how short a prefix of {@code line} can get before the language
 * identifier reports a language other than {@code expectedLanguage}.
 * The line is split on whitespace and prefixes are tried from the full line
 * down to a single token. Also increments {@code totalInputs}.
 *
 * @param line the input text to evaluate
 * @param expectedLanguage the language the text is expected to be detected as
 * @return the value of {@code getTextLength} for the shortest prefix that was
 *         not detected as a different language, or the length of the first
 *         token if no prefix was detected as a different language
 * @throws DetectionException if the complete line is detected as no language
 *         at all or as a language other than the expected one
 */
private int getShortestCorrectDetection(String line, Language expectedLanguage) {
  totalInputs++;
  String[] words = line.split("\\s+");
  int wordCount = words.length;
  for (int prefixSize = wordCount; prefixSize >= 1; prefixSize--) {
    boolean isFullLine = prefixSize == wordCount;
    String prefix = String.join(" ", Arrays.copyOfRange(words, 0, prefixSize));
    Language detected = languageIdentifier.detectLanguage(prefix);
    String detectedCode = detected == null ? null : detected.getShortCode();
    if (detectedCode == null) {
      if (isFullLine) {
        throw new DetectionException("Detection failed for '" + line + "', detected <null>");
      }
      // no result for a shortened prefix is not counted as a wrong detection
      continue;
    }
    if (expectedLanguage.getShortCode().equals(detectedCode)) {
      continue;
    }
    if (isFullLine) {
      throw new DetectionException("Detection failed for '" + line + "', detected " + detectedCode);
    }
    // this prefix is too short; the previous (one token longer) prefix was the last good one
    return getTextLength(words, prefixSize + 1);
  }
  return words[0].length();
}
use of org.languagetool.Language in project languagetool by languagetool-org.
the class LanguageDetectionEval method main.
/**
 * Runs the evaluation for every language returned by {@code Languages.get()}
 * and prints the elapsed time and the failure count to standard output.
 *
 * @param args command line arguments (not used)
 * @throws IOException declared for failures of the per-language evaluation
 */
public static void main(String[] args) throws IOException {
  LanguageDetectionEval evaluator = new LanguageDetectionEval();
  long begin = System.currentTimeMillis();
  for (Language lang : Languages.get()) {
    evaluator.evaluate(lang);
  }
  long elapsedMillis = System.currentTimeMillis() - begin;
  System.out.println();
  System.out.println("Time: " + elapsedMillis + "ms");
  System.out.println("Total detection failures: " + evaluator.totalFailures + "/" + evaluator.totalInputs);
}
use of org.languagetool.Language in project languagetool by languagetool-org.
the class PatternRuleMatcher method concatMatches.
/**
 * Concatenates the matches, and takes care of phrases (including inflection
 * using synthesis). For a phrase length of one, the single match is converted
 * directly; otherwise one result per phrase token is produced and the results
 * are merged via {@code combineLists}.
 * @param start Position of the element as referenced by match element in the rule.
 * @param index The index of the element found in the matching sentence.
 * @param tokenIndex The position of the token in the AnalyzedTokenReadings array.
 * @param tokens Array of AnalyzedTokenReadings
 * @param nextTokenPos Position used to compute the number of skipped tokens
 *        (its difference to the current token position is passed to the match state)
 * @param suggestionMatches List from which the match at position {@code start} is taken
 * @return Array of concatenated strings
 */
private String[] concatMatches(int start, int index, int tokenIndex, AnalyzedTokenReadings[] tokens, int nextTokenPos, List<Match> suggestionMatches) throws IOException {
  int phraseLength = phraseLen(index);
  Language lang = rule.language;
  if (phraseLength != 1) {
    // phrase: build one result per token of the phrase, then combine them
    List<String[]> perTokenResults = new ArrayList<>();
    for (int offset = 0; offset < phraseLength; offset++) {
      int skipped = nextTokenPos - (tokenIndex + offset);
      MatchState state = suggestionMatches.get(start).createState(lang.getSynthesizer(), tokens, tokenIndex - 1 + offset, skipped);
      perTokenResults.add(state.toFinalString(lang));
    }
    String[][] resultArrays = perTokenResults.toArray(new String[perTokenResults.size()][]);
    return combineLists(resultArrays, new String[perTokenResults.size()], 0, lang);
  }
  int skipped = nextTokenPos - tokenIndex;
  Match match = suggestionMatches.get(start);
  MatchState state = match.createState(lang.getSynthesizer(), tokens, tokenIndex - 1, skipped);
  String[] result = state.toFinalString(lang);
  // a spell-checking match that yields only one empty string is replaced by the MISTAKE marker
  if (match.checksSpelling() && result.length == 1 && "".equals(result[0])) {
    return new String[] { MISTAKE };
  }
  return result;
}
use of org.languagetool.Language in project languagetool by languagetool-org.
the class LanguageBuilder method makeLanguage.
/**
 * Takes an XML file named <tt>rules-xx-language.xml</tt>,
 * e.g. <tt>rules-de-German.xml</tt> and builds
 * a Language object for that language.
 * <p>
 * The file name must consist of exactly three parts separated by {@code -}:
 * the literal {@code rules}, a language code whose length matches that of
 * {@code en}, {@code ast} or {@code en_US}, and the language name followed
 * by {@code .xml}. If the code is already supported, the existing language is
 * instantiated anew and wrapped in an {@code ExtendedLanguage}; otherwise a
 * minimal ad-hoc language is created whose only rule file is {@code file}.
 *
 * @param file the rule file to build the language from, must not be null
 * @param isAdditional value reported by {@code isExternal()} of an ad-hoc language
 * @return the language built from the file name
 * @throws RuleFilenameException if the file name does not follow the expected pattern
 */
private static Language makeLanguage(File file, boolean isAdditional) throws IllegalAccessException, InstantiationException {
  Objects.requireNonNull(file, "file cannot be null");
  if (!file.getName().endsWith(".xml")) {
    throw new RuleFilenameException(file);
  }
  String[] parts = file.getName().split("-");
  boolean startsWithRules = parts[0].equals("rules");
  boolean secondPartHasCorrectLength = parts.length == 3 && (parts[1].length() == "en".length() || parts[1].length() == "ast".length() || parts[1].length() == "en_US".length());
  if (!startsWithRules || !secondPartHasCorrectLength) {
    throw new RuleFilenameException(file);
  }
  //TODO: when the XML file is mergeable with
  // other rules (check this in the XML Rule Loader by using rules[@integrate='add']?),
  // subclass the existing language,
  //and adjust the settings if any are set in the rule file default configuration set
  Language newLanguage;
  if (Languages.isLanguageSupported(parts[1])) {
    Language baseLanguage = Languages.getLanguageForShortCode(parts[1]).getClass().newInstance();
    newLanguage = new ExtendedLanguage(baseLanguage, parts[2].replace(".xml", ""), file);
  } else {
    newLanguage = new Language() {
      @Override
      public Locale getLocale() {
        return new Locale(getShortCode());
      }
      @Override
      public Contributor[] getMaintainers() {
        return null;
      }
      @Override
      public String getShortCode() {
        if (parts[1].length() == 2) {
          return parts[1];
        }
        //en as in en_US
        return parts[1].split("_")[0];
      }
      @Override
      public String[] getCountries() {
        String[] codeParts = parts[1].split("_");
        // Codes without a country part (two-letter codes, but also accepted
        // three-letter codes such as "ast") have no second element; indexing
        // it unconditionally would throw ArrayIndexOutOfBoundsException.
        if (parts[1].length() == 2 || codeParts.length < 2) {
          return new String[] { "" };
        }
        //US as in en_US
        return new String[] { codeParts[1] };
      }
      @Override
      public String getName() {
        return parts[2].replace(".xml", "");
      }
      @Override
      public List<Rule> getRelevantRules(ResourceBundle messages) {
        return Collections.emptyList();
      }
      @Override
      public List<String> getRuleFileNames() {
        List<String> ruleFiles = new ArrayList<>();
        ruleFiles.add(file.getAbsolutePath());
        return ruleFiles;
      }
      @Override
      public boolean isExternal() {
        return isAdditional;
      }
    };
  }
  return newLanguage;
}
use of org.languagetool.Language in project languagetool by languagetool-org.
the class LanguageIdentifier method getLanguageCodes.
/**
 * Collects the language codes of all languages from {@code Languages.get()}
 * that are neither variants nor listed in {@code ignoreLangCodes} or
 * {@code externalLangCodes}. The code {@code zh} is expanded to
 * {@code zh-CN} and {@code zh-TW}.
 *
 * @return the collected language codes, in iteration order of the languages
 */
private static List<String> getLanguageCodes() {
  List<String> codes = new ArrayList<>();
  for (Language language : Languages.get()) {
    String code = language.getShortCode();
    boolean relevant = !language.isVariant() && !ignoreLangCodes.contains(code) && !externalLangCodes.contains(code);
    if (relevant) {
      if ("zh".equals(code)) {
        codes.add("zh-CN");
        codes.add("zh-TW");
      } else {
        codes.add(code);
      }
    }
  }
  return codes;
}
Aggregations