use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.
the class GermanTagger method tag.
/**
 * Tags every token of a sentence and returns one {@link AnalyzedTokenReadings} per token.
 *
 * <p>Each token is first looked up as-is with the word tagger. When {@code ignoreCase} is set,
 * a lower-cased lookup is tried as well: as a replacement while the "first word" flag is still
 * set and the direct lookup found nothing, or as an addition for the token at offset 0.
 * Tokens that remain unknown are split with the compound tokenizer and the last compound part
 * is looked up instead; if that also fails, a "no info" token is emitted.
 *
 * <p>Lower-casing uses an explicit German locale so the result does not depend on the JVM's
 * default locale (e.g. a Turkish default would turn "I" into a dotless "ı").
 *
 * @param sentenceTokens the tokens of one sentence, in order
 * @param ignoreCase whether lower-cased variants may be looked up at the start of the sentence
 * @return the readings for each token, in input order; each entry carries the summed length of
 *         all preceding tokens as its start position
 * @throws IOException declared by the signature; NOTE(review): presumably raised by
 *         initialization or dictionary access - confirm against the helpers
 */
public List<AnalyzedTokenReadings> tag(List<String> sentenceTokens, boolean ignoreCase) throws IOException {
  initializeIfRequired();
  boolean firstWord = true;
  List<AnalyzedTokenReadings> tokenReadings = new ArrayList<>();
  int pos = 0;
  for (String word : sentenceTokens) {
    List<AnalyzedToken> readings = new ArrayList<>();
    List<TaggedWord> taggerTokens = getWordTagger().tag(word);
    if (firstWord && taggerTokens.isEmpty() && ignoreCase) {
      // e.g. "Das" -> "das" at start of sentence
      taggerTokens = getWordTagger().tag(word.toLowerCase(java.util.Locale.GERMAN));
      // the flag stays set only if this token was empty or a single non-word character
      firstWord = word.matches("^\\W?$");
    } else if (pos == 0 && ignoreCase) {
      // "Haben", "Sollen", "Können", "Gerade" etc. at start of sentence
      taggerTokens.addAll(getWordTagger().tag(word.toLowerCase(java.util.Locale.GERMAN)));
    }
    if (!taggerTokens.isEmpty()) {
      readings.addAll(getAnalyzedTokens(taggerTokens, word));
    } else if (StringTools.isEmpty(word.trim())) {
      // whitespace-only token: nothing to decompose
      readings.add(getNoInfoToken(word));
    } else {
      // word not known, try to decompose it and use the last part for POS tagging:
      List<String> compoundParts = compoundTokenizer.tokenize(word);
      List<TaggedWord> partTaggerTokens = null;
      if (compoundParts.size() > 1) {
        // last part governs a word's POS:
        String lastPart = compoundParts.get(compoundParts.size() - 1);
        if (StringTools.startsWithUppercase(word)) {
          lastPart = StringTools.uppercaseFirstChar(lastPart);
        }
        partTaggerTokens = getWordTagger().tag(lastPart);
      }
      if (partTaggerTokens != null && !partTaggerTokens.isEmpty()) {
        readings.addAll(getAnalyzedTokens(partTaggerTokens, word, compoundParts));
      } else {
        readings.add(getNoInfoToken(word));
      }
    }
    tokenReadings.add(new AnalyzedTokenReadings(readings.toArray(new AnalyzedToken[0]), pos));
    pos += word.length();
  }
  return tokenReadings;
}
use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.
the class AccentuationDataLoader method loadWords.
/**
 * Loads a semicolon-separated word list from the rules directory.
 *
 * <p>Every non-empty line that does not start with {@code #} must consist of exactly three
 * semicolon-separated fields. The first field becomes the map key; the second and third are
 * used as token and POS tag of a single reading (without lemma) at start position 0.
 *
 * @param path location of the file, resolved via the data broker's rules directory
 * @return a map from the first field of each line to its readings
 * @throws RuntimeException if a data line does not split into exactly three fields
 */
Map<String, AnalyzedTokenReadings> loadWords(String path) {
  final Map<String, AnalyzedTokenReadings> wordsByKey = new HashMap<>();
  final InputStream stream = JLanguageTool.getDataBroker().getFromRulesDirAsStream(path);
  try (Scanner lines = new Scanner(stream, FILE_ENCODING)) {
    while (lines.hasNextLine()) {
      final String entry = lines.nextLine().trim();
      // blank lines and comment lines carry no data
      final boolean isData = !entry.isEmpty() && !entry.startsWith("#");
      if (isData) {
        final String[] fields = entry.split(";");
        if (fields.length != 3) {
          throw new RuntimeException("Format error in file " + path + ", line: " + entry + ", " + "expected 3 semicolon-separated parts, got " + fields.length);
        }
        final AnalyzedToken reading = new AnalyzedToken(fields[1], fields[2], null);
        wordsByKey.put(fields[0], new AnalyzedTokenReadings(reading, 0));
      }
    }
  }
  return wordsByKey;
}
use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.
the class MorfologikCatalanSpellerRule method matchPostagRegexp.
/**
 * Tells whether at least one reading of the token has a POS tag that is matched
 * in full by the given regular expression. A reading without a POS tag is
 * treated as if its tag were {@code "UNKNOWN"}.
 *
 * @param aToken the readings to inspect
 * @param pattern the expression the whole POS tag must match
 * @return {@code true} as soon as one reading matches, {@code false} otherwise
 */
private boolean matchPostagRegexp(AnalyzedTokenReadings aToken, Pattern pattern) {
  for (AnalyzedToken reading : aToken) {
    final String rawTag = reading.getPOSTag();
    final String effectiveTag = rawTag != null ? rawTag : "UNKNOWN";
    if (pattern.matcher(effectiveTag).matches()) {
      return true;
    }
  }
  return false;
}
use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.
the class ReplaceOperationNamesRule method matchPostagRegexp.
/**
 * Checks the POS tags of all readings against a regular expression.
 * Readings whose POS tag is {@code null} are compared as {@code "UNKNOWN"}.
 *
 * @param aToken the readings whose POS tags are tested
 * @param pattern the expression that has to match an entire POS tag
 * @return {@code true} if any reading's tag matches, {@code false} if none does
 */
private boolean matchPostagRegexp(AnalyzedTokenReadings aToken, Pattern pattern) {
  for (AnalyzedToken reading : aToken) {
    final String tag = reading.getPOSTag();
    final Matcher tagMatcher = pattern.matcher(tag == null ? "UNKNOWN" : tag);
    if (tagMatcher.matches()) {
      // first hit decides the result, no need to look at further readings
      return true;
    }
  }
  return false;
}
use of org.languagetool.AnalyzedToken in project languagetool by languagetool-org.
the class CatalanTagger method additionalTags.
/**
 * Derives readings for a word from related dictionary entries.
 *
 * <p>Three cases are tried in order:
 * <ol>
 *   <li>Word ending in "ment": if the part before "ment" has a reading whose POS tag matches
 *       {@code ADJ_PART_FS}, a single reading tagged {@code "RG"} with the lower-cased word as
 *       lemma is returned.</li>
 *   <li>Word matching {@code PREFIXES_FOR_VERBS}: group 2 is looked up, and every reading whose
 *       POS tag matches {@code VERB} is copied with the lower-cased group 1 prepended to its
 *       lemma. The (possibly empty) list is returned.</li>
 *   <li>Word containing "ŀ" or "Ŀ": the lower-cased word with "ŀ" replaced by "l·" is looked up
 *       and its readings are returned for the original word.</li>
 * </ol>
 *
 * <p>All lower-casing uses {@code conversionLocale}, so lookups and lemmas do not depend on the
 * JVM's default locale.
 *
 * @param word the word to derive readings for
 * @param stemmer not used by this implementation; a lookup on {@code getDictionary()} is
 *        created instead
 * @return the derived readings, or {@code null} if none of the three cases applies
 */
@Nullable
protected List<AnalyzedToken> additionalTags(String word, IStemmer stemmer) {
  final IStemmer dictLookup = new DictionaryLookup(getDictionary());
  List<AnalyzedToken> additionalTaggedTokens = new ArrayList<>();
  // Feminine singular adjective or feminine singular participle + -ment
  if (word.endsWith("ment")) {
    final String lowerWord = word.toLowerCase(conversionLocale);
    final String possibleAdj = lowerWord.replaceAll("^(.+)ment$", "$1");
    List<AnalyzedToken> taggerTokens = asAnalyzedTokenList(possibleAdj, dictLookup.lookup(possibleAdj));
    for (AnalyzedToken taggerToken : taggerTokens) {
      final String posTag = taggerToken.getPOSTag();
      if (posTag != null && ADJ_PART_FS.matcher(posTag).matches()) {
        additionalTaggedTokens.add(new AnalyzedToken(word, "RG", lowerWord));
        return additionalTaggedTokens;
      }
    }
    // no matching base form: fall through to the remaining cases
  }
  // Any well-formed verb with prefixes is tagged as a verb copying the original tags
  Matcher matcher = PREFIXES_FOR_VERBS.matcher(word);
  if (matcher.matches()) {
    final String prefix = matcher.group(1).toLowerCase(conversionLocale);
    final String possibleVerb = matcher.group(2).toLowerCase(conversionLocale);
    List<AnalyzedToken> taggerTokens = asAnalyzedTokenList(possibleVerb, dictLookup.lookup(possibleVerb));
    for (AnalyzedToken taggerToken : taggerTokens) {
      final String posTag = taggerToken.getPOSTag();
      if (posTag != null && VERB.matcher(posTag).matches()) {
        String lemma = prefix.concat(taggerToken.getLemma());
        additionalTaggedTokens.add(new AnalyzedToken(word, posTag, lemma));
      }
    }
    return additionalTaggedTokens;
  }
  // U+0140 LATIN SMALL LETTER L WITH MIDDLE DOT
  if (word.contains("ŀ") || word.contains("Ŀ")) {
    final String lowerWord = word.toLowerCase(conversionLocale);
    final String possibleWord = lowerWord.replaceAll("ŀ", "l·");
    // readings are attached to the word as written, not to the normalized spelling
    return asAnalyzedTokenList(word, dictLookup.lookup(possibleWord));
  }
  return null;
}
Aggregations